import numpy as np
from PIL import Image
Day 4 - Naive matmul+exercises
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
cuda.init()
device = cuda.Device(0)
print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
from pathlib import Path
cu_file = "kernels/matmul/matmul.cu"
Naive matmul
kernels/matmul/matmul.cu
#include <stdint.h>
#include <stdio.h>
__global__ void matmul_f32(float *m1, float *m2, float *res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim) {
    // One thread per output element: x indexes the output column, y the output row.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        // Dot product of row y of m1 with column x of m2, accumulated in double.
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y*m1_width + i] * m2[i*m2_width + x];
        }
        res[y*out_shape_1 + x] = out;
    }
}
from lovely_numpy import Lo
m1 = np.random.randn(100, 200).astype(np.float32)
m2 = np.random.randn(200, 300).astype(np.float32)
np_res = np.matmul(m1, m2)
Lo(np_res)
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
Testing naive matmul
BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 32
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()
    mod = SourceModule(Path(cu_file).read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])
    matmul_f32 = mod.get_function("matmul_f32")
    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)
    res = np.empty(out_shape, dtype=np.float32)
    gpu_res = cuda.mem_alloc_like(res)
    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)
    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (
        ((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X),
        ((out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y),
        1
    )
    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")
    ctx.synchronize()
    matmul_f32(gpu_m1, gpu_m2, gpu_res, np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]), grid=grid_size, block=block_size)
    ctx.synchronize()
    cuda.memcpy_dtoh(res, gpu_res)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()
Lo(res)
Grid size: (10, 4, 1)
Block size: (32, 32, 1)
Result dimensions: 100x300
Total threads: 40960
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
m1[0,1], m2[1,0]
(np.float32(-0.9388011), np.float32(0.16297509))
np.isclose(res, np_res).all()
np.False_
Lo(np.isclose(res, np_res)).chans
Numeric stability
Looks like matmul is very susceptible to numerical instability:
> Since we are adding numbers to the accumulator over and over, once the accumulated value gets large enough it loses the precision needed to correctly accumulate small values. If it then gets a large update in the opposite direction of the accumulated value and becomes small again, those errors become very significant.
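A tiny illustration of that effect (my own example, not from the notebook run):
acc = np.float32(1e8)
# 1.0 is below the float32 resolution at 1e8 (np.spacing(np.float32(1e8)) == 8.0),
# so the small update is silently lost.
print(acc + np.float32(1.0) == acc)  # True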
But I think overall it's correct. I changed the accumulator to a double and am still seeing the discrepancy. It's possible that numpy's matmul is also not very precise.
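A sanity check I would run at this point (a sketch, reusing m1, m2, res and np_res from above; the tolerances are arbitrary): compare both float32 results against a float64 reference, so the question becomes "is the mismatch just round-off?" rather than "is the kernel wrong?".
np_res64 = np.matmul(m1.astype(np.float64), m2.astype(np.float64))
# Worst absolute error of the GPU result and of numpy's own f32 matmul vs the f64 reference
print(np.abs(res - np_res64).max(), np.abs(np_res - np_res64).max())
# With a slightly looser tolerance the two f32 results should agree
print(np.allclose(res, np_res, rtol=1e-4, atol=1e-5))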
Exercises
Let’s do the exercises
- In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.
- Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.
- Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.
- Analyze the pros and cons of each of the two kernel designs.
cu_file2 = "kernels/matmul/matmul-row_col.cu"
One thread per row/col:
Naive matmul row/col
kernels/matmul/matmul-row_col.cu
#include <stdint.h>
#include <stdio.h>
__global__ void matmul_f32(float* m1, float* m2, float* res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y * m1_width + i] * m2[i * m2_width + x];
        }
        res[y * out_shape_1 + x] = out;
    }
}
__global__ void matmul_f32_row(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    // One thread per output row: thread y computes every column of row y.
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    if (y < out_shape_0) {
        for (int x = 0; x < out_shape_1; x++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}
__global__ void matmul_f32_col(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    // One thread per output column: thread x computes every row of column x.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    if (x < out_shape_1) {
        // Loop over rows, i.e. out_shape_0 (looping to out_shape_1 here would write past the end of res).
        for (int y = 0; y < out_shape_0; y++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}
Testing one thread per row matmul
BLOCK_SIZE_X = 1
BLOCK_SIZE_Y = 32
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()
    mod = SourceModule(Path(cu_file2).read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])
    matmul_f32_row = mod.get_function("matmul_f32_row")
    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)
    res_row = np.empty(out_shape, dtype=np.float32)
    gpu_res_row = cuda.mem_alloc_like(res_row)
    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)
    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (1, ((out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y), 1)
    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")
    ctx.synchronize()
    matmul_f32_row(gpu_m1, gpu_m2, gpu_res_row, np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]), grid=grid_size, block=block_size)
    # ctx.synchronize()
    cuda.memcpy_dtoh(res_row, gpu_res_row)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()
Lo(res_row)
Grid size: (1, 4, 1)
Block size: (1, 32, 1)
Result dimensions: 100x300
Total threads: 128
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
Lo(res)
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
(res == res_row).all()
np.True_
Testing one thread per col matmul
BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 1
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()
    mod = SourceModule(Path(cu_file2).read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])
    matmul_f32_col = mod.get_function("matmul_f32_col")
    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)
    res_col = np.empty(out_shape, dtype=np.float32)
    gpu_res_col = cuda.mem_alloc_like(res_col)
    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)
    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X), 1, 1)
    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")
    ctx.synchronize()
    matmul_f32_col(gpu_m1, gpu_m2, gpu_res_col, np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]), grid=grid_size, block=block_size)
    ctx.synchronize()
    cuda.memcpy_dtoh(res_col, gpu_res_col)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()
Lo(res_col)
Grid size: (10, 1, 1)
Block size: (32, 1, 1)
Result dimensions: 100x300
Total threads: 320
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
(res_col == res).all()
np.True_
- Analyze the pros and cons of each of the two kernel designs.
They both perform poorly, mainly because neither launches enough threads to saturate the GPU (128 and 320 threads here, versus 40960 for the one-thread-per-element kernel).
The row variant possibly suffers a bit less, since all of its threads walk m2 in the same order and can reuse it through the cache that is shared between thread blocks. A rough timing sketch follows below.
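To actually back this up I would time the kernels. A minimal sketch with CUDA events (hypothetical, not run for this post; it assumes the kernel functions, the gpu_* buffers, out_shape and an active context from the cells above are still alive, and reuses gpu_res as scratch output):
def time_kernel(fn, grid, block, n_iters=20):
    # Average time per launch, measured with CUDA events.
    start, end = cuda.Event(), cuda.Event()
    start.record()
    for _ in range(n_iters):
        fn(gpu_m1, gpu_m2, gpu_res,
           np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
           grid=grid, block=block)
    end.record()
    end.synchronize()
    return start.time_till(end) / n_iters  # milliseconds per launch
# grid/block taken from each kernel's test cell above
print(time_kernel(matmul_f32, (10, 4, 1), (32, 32, 1)))
print(time_kernel(matmul_f32_row, (1, 4, 1), (1, 32, 1)))
print(time_kernel(matmul_f32_col, (10, 1, 1), (32, 1, 1)))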