cuda-100
  1. Day 1 - playing with nvcc
  • 100 days of CUDA
  • Day 0 - playing with PyCUDA
  • Day 1 - playing with nvcc
  • Day 2 - RGB to grayscale
  • Day 3 - RGB blur
  • Day 4 - Naive matmul+exercises
  • Day 5 - Matrix-vector multiplication
  • Day 6 - Tiled matmul
  • Day 7 - Tiled matmul experiments
  • Day 8 - Thread coarsening
  • Day 9 - Conv 2D
  • Day 10 - Improving Conv2d performance
  • Day 11 - conv2d with shared memory
  • Day 12 - conv2d with shared memory and halo

On this page

  • day_01_nvcc/src/hello.cu
  • day_01_nvcc/Makefile

Day 1 - playing with nvcc

from pathlib import Path
# Relative path of the CUDA source file for this day; presumably used by the
# notebook to load and display the listing that follows — TODO confirm.
cu_file = "day_01_nvcc/src/hello.cu"

day_01_nvcc/src/hello.cu

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>


// Threads per block used when launching testKernel from vecAdd_f32.
// The #ifndef guard lets a build supply its own value (e.g. -DN_THREADS=256);
// 512 is only the fallback when nothing was defined beforehand.
#ifndef N_THREADS
    #define N_THREADS 512
#endif


/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The launch
 * must provide at least n threads in total; threads whose index falls past
 * the end of the vectors are masked off by the bounds check. No shared
 * memory is used.
 *
 * a, b : input vectors, n floats each
 * c    : output vector, n floats
 * n    : number of elements
 */
__global__ void testKernel(float *a, float *b, float *c, uint n)
{
    // Build the global index in size_t. A signed int index turns negative once
    // it passes INT_MAX; it would then still satisfy the unsigned comparison
    // against n and be used as a negative (out-of-bounds) subscript.
    size_t i = (size_t)blockDim.x * blockIdx.x + threadIdx.x;

    if (i < n) {
        c[i] = a[i] + b[i];
    }
}


/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void vecAdd_check(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Compute C = A + B on the GPU.
 *
 * Allocates three device buffers, copies A and B to the device, launches
 * testKernel with N_THREADS threads per block, copies the result back into C
 * and releases the device buffers.
 *
 * A, B : host input vectors, n floats each
 * C    : host output vector, n floats
 * n    : number of elements; n == 0 is a no-op
 *
 * Any CUDA failure is reported on stderr and terminates the process, since
 * the void return type leaves no way to signal it to the caller.
 */
void vecAdd_f32(float *A, float *B, float *C, uint n) {
    // A grid with zero blocks is an invalid launch configuration; nothing to do.
    if (n == 0) {
        return;
    }

    float *A_d = NULL, *B_d = NULL, *C_d = NULL;
    // Byte count in size_t: n * sizeof(float) does not fit in an int once
    // n reaches 2^29.
    size_t size = (size_t)n * sizeof(float);

    vecAdd_check(cudaMalloc((void **) &A_d, size), "cudaMalloc(A_d)");
    vecAdd_check(cudaMalloc((void **) &B_d, size), "cudaMalloc(B_d)");
    vecAdd_check(cudaMalloc((void **) &C_d, size), "cudaMalloc(C_d)");

    vecAdd_check(cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice), "copy A to device");
    vecAdd_check(cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice), "copy B to device");

    // Ceil-divide in 64-bit so n + N_THREADS - 1 cannot wrap for n near UINT_MAX.
    unsigned int blocks = (unsigned int)(((size_t)n + N_THREADS - 1) / N_THREADS);
    testKernel <<<blocks, N_THREADS>>>(A_d, B_d, C_d, n);
    // A launch does not return a status itself; configuration errors are
    // only visible through cudaGetLastError().
    vecAdd_check(cudaGetLastError(), "testKernel launch");

    // Blocking copy: also surfaces any fault raised while the kernel ran.
    vecAdd_check(cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost), "copy C to host");

    vecAdd_check(cudaFree(A_d), "cudaFree(A_d)");
    vecAdd_check(cudaFree(B_d), "cudaFree(B_d)");
    vecAdd_check(cudaFree(C_d), "cudaFree(C_d)");
}



/*
 * Host-side reference implementation of vector addition.
 *
 * Writes a[k] + b[k] into c[k] for k = 0 .. n-1, in ascending order.
 * Nothing is written when n is 0.
 */
void add_vectors_cpu(float *a, float *b, float *c, uint n) {
    uint k = 0;
    while (k != n) {
        const float sum = a[k] + b[k];
        c[k] = sum;
        ++k;
    }
}

/*
 * Compare the first n elements of two float arrays for exact equality.
 *
 * Returns 1 when every pair a[k], b[k] compares equal, and 0 as soon as one
 * pair differs. No tolerance is applied. An empty range (n == 0) yields 1.
 */
int verify_equal(float *a, float *b, uint n)
{
    uint k = 0;

    // Advance while the elements agree; stop at the first mismatch.
    while (k < n && a[k] == b[k]) {
        ++k;
    }

    // Reaching n means no mismatch was found.
    return (k == n) ? 1 : 0;
}


/*
 * Driver: fills two vectors of 1024*1024 floats with rand()-derived values in
 * [0, 1], adds them on the GPU (vecAdd_f32) and on the CPU (add_vectors_cpu),
 * and prints whether the two results are identical.
 *
 * Returns 0 after the comparison has been printed, EXIT_FAILURE if a host
 * buffer could not be allocated.
 */
int main() {
    uint n = 1024*1024;
    size_t bytes = (size_t)n * sizeof(float);

    float *A = (float*)malloc(bytes);
    float *B = (float*)malloc(bytes);
    float *C = (float*)malloc(bytes);
    float *C_cpu = (float *)malloc(bytes);

    // malloc may return NULL; bail out instead of dereferencing it below.
    if (A == NULL || B == NULL || C == NULL || C_cpu == NULL) {
        fprintf(stderr, "Failed to allocate host buffers (%zu bytes each)\n", bytes);
        free(A);      // free(NULL) is a no-op, so partial allocations are fine
        free(B);
        free(C);
        free(C_cpu);
        return EXIT_FAILURE;
    }

    for(uint i = 0; i < n; i++) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }

    vecAdd_f32(A, B, C, n);

    add_vectors_cpu(A, B, C_cpu, n);

    printf("Do they match? %s!\n", verify_equal(C, C_cpu, n) ? "Yes" : "No" );

    // Release the host buffers that were previously left allocated at exit.
    free(A);
    free(B);
    free(C);
    free(C_cpu);

    return 0;
}
# Relative path of the Makefile for this day; presumably used by the notebook
# to load and display the listing that follows — TODO confirm.
make_file = "day_01_nvcc/Makefile"

day_01_nvcc/Makefile

# Builds every src/*.cu file into build/*.o and links them into build/hello.
#
#   make / make release   optimized build
#   make debug            same build with host and device debug info added
#   make clean            remove the build directory
#
# NOTE: recipe lines must begin with a TAB character; GNU make rejects
# space-indented recipes with "missing separator".

# Compiler and flags
NVCC        := nvcc
CUDA_ARCH   := -arch=sm_60 -gencode=arch=compute_60,code=sm_60 \
               -gencode=arch=compute_70,code=sm_70 \
               -gencode=arch=compute_75,code=sm_75

# Build flags
NVCC_FLAGS  := -std=c++14 -O3 $(CUDA_ARCH)
WARN_FLAGS  := -Xcompiler -Wall,-Wextra
# -G produces full device debug information, which makes -lineinfo redundant
# (nvcc warns that -G overrides it), so only -g -G are passed.
DEBUG_FLAGS := -g -G

# Build directories
BUILD_DIR   := build
SRC_DIR     := src

# Default target name
TARGET      := hello

# Source files
SRCS        := $(wildcard $(SRC_DIR)/*.cu)
OBJS        := $(SRCS:$(SRC_DIR)/%.cu=$(BUILD_DIR)/%.o)

# Build targets
all: release

# Target-specific variable: applies to everything built as a prerequisite of
# `debug`. Objects are shared with `release`, so run `make clean` when
# switching between the two.
debug: NVCC_FLAGS += $(DEBUG_FLAGS)
debug: $(BUILD_DIR)/$(TARGET)

release: $(BUILD_DIR)/$(TARGET)

# Link step
$(BUILD_DIR)/$(TARGET): $(OBJS)
	mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) $(WARN_FLAGS) $^ -o $@

# Compile step: one object per .cu file
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu
	mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) $(WARN_FLAGS) -c $< -o $@

clean:
	rm -rf $(BUILD_DIR)

.PHONY: all debug release clean