from pathlib import Path
Day 1 - playing with nvcc
src/hello.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Threads per block used when launching the kernel. The #ifndef guard lets
// the value be overridden at compile time (e.g. -DN_THREADS=256).
#ifndef N_THREADS
#define N_THREADS 512
#endif
// Element-wise vector addition: c[i] = a[i] + b[i] for every i < n.
//
// Expects a 1D launch in which each thread handles at most one element;
// threads whose global index is >= n do nothing, so the grid may be larger
// than n. The pointers are dereferenced on the device.
//
//   a, b  input vectors, at least n floats each
//   c     output vector, at least n floats
//   n     number of elements to process
__global__ void testKernel(float *a, float *b, float *c, uint n)
{
    // Compute the global index in size_t: a signed int turns negative once
    // the index reaches 2^31, which would index the arrays out of bounds.
    size_t i = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}
// Reports a failed CUDA runtime call on stderr and terminates the process.
static void vecAdd_checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Computes C = A + B element-wise on the GPU.
//
// Allocates device buffers, copies A and B to the device, launches testKernel
// with N_THREADS threads per block, copies the result back into C and frees
// the device buffers. Any CUDA failure is reported on stderr and ends the
// process, because the function has no way to return an error to the caller.
//
//   A, B  host input vectors, at least n floats each
//   C     host output vector, at least n floats
//   n     number of elements; n == 0 is a no-op
void vecAdd_f32(float *A, float *B, float *C, uint n) {
    // A grid with zero blocks is an invalid launch configuration.
    if (n == 0) {
        return;
    }

    float *A_d = NULL, *B_d = NULL, *C_d = NULL;
    // size_t byte count: an int would overflow once n reaches 2^29 floats.
    size_t size = (size_t)n * sizeof(float);

    vecAdd_checkCuda(cudaMalloc((void **) &A_d, size), "cudaMalloc(A_d)");
    vecAdd_checkCuda(cudaMalloc((void **) &B_d, size), "cudaMalloc(B_d)");
    vecAdd_checkCuda(cudaMalloc((void **) &C_d, size), "cudaMalloc(C_d)");

    vecAdd_checkCuda(cudaMemcpy(A_d, A, size, cudaMemcpyHostToDevice), "copy of A to device");
    vecAdd_checkCuda(cudaMemcpy(B_d, B, size, cudaMemcpyHostToDevice), "copy of B to device");

    // Ceil-div in size_t so n + N_THREADS - 1 cannot wrap for n near UINT_MAX.
    unsigned int blocks = (unsigned int)(((size_t)n + N_THREADS - 1) / N_THREADS);
    testKernel <<<blocks, N_THREADS>>>(A_d, B_d, C_d, n);
    // Launch-configuration errors are only visible through cudaGetLastError().
    vecAdd_checkCuda(cudaGetLastError(), "testKernel launch");

    // The blocking copy also waits for the kernel, so execution faults
    // surface through its return code.
    vecAdd_checkCuda(cudaMemcpy(C, C_d, size, cudaMemcpyDeviceToHost), "copy of C to host");

    vecAdd_checkCuda(cudaFree(A_d), "cudaFree(A_d)");
    vecAdd_checkCuda(cudaFree(B_d), "cudaFree(B_d)");
    vecAdd_checkCuda(cudaFree(C_d), "cudaFree(C_d)");
}
// Host reference implementation: writes a[k] + b[k] into c[k] for k in [0, n).
void add_vectors_cpu(float *a, float *b, float *c, uint n) {
    uint k = 0;
    while (k < n) {
        const float sum = a[k] + b[k];
        c[k] = sum;
        ++k;
    }
}
// Returns 1 when the first n elements of a and b compare exactly equal,
// 0 as soon as any pair differs. An empty range (n == 0) yields 1.
int verify_equal(float *a, float *b, uint n)
{
    int all_match = 1;
    for (uint pos = 0; pos < n && all_match; ++pos) {
        // Exact comparison on purpose: any difference (including NaN) fails.
        if (!(a[pos] == b[pos])) {
            all_match = 0;
        }
    }
    return all_match;
}
// Demo driver: fills two vectors of 1024*1024 floats with pseudo-random
// values, adds them on the GPU and on the CPU, and prints whether the two
// results match. Returns EXIT_FAILURE if a host allocation fails, 0 otherwise.
int main() {
    uint n = 1024*1024;
    size_t bytes = (size_t)n * sizeof(float);

    float *A = (float*)malloc(bytes);
    float *B = (float*)malloc(bytes);
    float *C = (float*)malloc(bytes);
    float *C_cpu = (float *)malloc(bytes);
    // malloc may return NULL; bail out instead of dereferencing it below.
    if (A == NULL || B == NULL || C == NULL || C_cpu == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(A);
        free(B);
        free(C);
        free(C_cpu);
        return EXIT_FAILURE;
    }

    // rand() is left unseeded, so the inputs are the same on every run.
    for(uint i = 0; i < n; i++) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }

    vecAdd_f32(A, B, C, n);
    add_vectors_cpu(A, B, C_cpu, n);
    printf("Do they match? %s!\n", verify_equal(C, C_cpu, n) ? "Yes" : "No" );

    free(A);
    free(B);
    free(C);
    free(C_cpu);
    return 0;
}
Makefile (AI-generated)
# Compiler and flags
NVCC := nvcc
# Machine code for sm_60/sm_70/sm_75, plus compute_75 PTX so that GPUs newer
# than sm_75 can still JIT-compile the kernels instead of failing to load them.
CUDA_ARCH := -arch=sm_60 -gencode=arch=compute_60,code=sm_60 \
             -gencode=arch=compute_70,code=sm_70 \
             -gencode=arch=compute_75,code=sm_75 \
             -gencode=arch=compute_75,code=compute_75
# Build flags
NVCC_FLAGS := -std=c++14 -O3 $(CUDA_ARCH)
WARN_FLAGS := -Xcompiler -Wall,-Wextra
# -G (full device debug) overrides -lineinfo, so only -g -G are passed.
DEBUG_FLAGS := -g -G
# Build directories
BUILD_DIR := build
SRC_DIR := src
# Default target name
TARGET := hello
# Source files
SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(SRCS:$(SRC_DIR)/%.cu=$(BUILD_DIR)/%.o)
# Build targets
# `make` / `make all` builds the optimized binary.
all: release

# The debug target appends DEBUG_FLAGS to NVCC_FLAGS for itself and its
# prerequisites (target-specific variable).
debug: NVCC_FLAGS += $(DEBUG_FLAGS)
debug: $(BUILD_DIR)/$(TARGET)

release: $(BUILD_DIR)/$(TARGET)

# Link all object files into the final executable.
# Recipe lines must start with a TAB character.
$(BUILD_DIR)/$(TARGET): $(OBJS)
	mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) $(WARN_FLAGS) $^ -o $@

# Compile each .cu source into an object file under BUILD_DIR.
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu
	mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) $(WARN_FLAGS) -c $< -o $@

# Remove every build artifact.
clean:
	rm -rf $(BUILD_DIR)

.PHONY: all debug release clean