Day 4 - Naive matmul + exercises

import numpy as np
from PIL import Image
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
cuda.init()
device = cuda.Device(0)
print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
from pathlib import Path
// day_04_matmul.cu
#include <stdint.h>
#include <stdio.h>

// Naive matmul: one thread per output element of res = m1 @ m2.
__global__ void matmul_f32(float *m1, float *m2, float *res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y*m1_width + i] * m2[i*m2_width + x];
        }
        res[y*out_shape_1 + x] = out;
    }
}
from lovely_numpy import Lo
m1 = np.random.randn(100, 200).astype(np.float32)
m2 = np.random.randn(200, 300).astype(np.float32)
np_res = np.matmul(m1, m2)
Lo(np_res)
array[100, 300] f32 n=30000 (0.1Mb) x∈[-63.416, 56.938] μ=-0.039 σ=14.187
BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 32
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()

    mod = SourceModule(Path("day_04_matmul.cu").read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])
    matmul_f32 = mod.get_function("matmul_f32")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res = np.empty(out_shape, dtype=np.float32)
    gpu_res = cuda.mem_alloc_like(res)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (
        (out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X,
        (out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y,
        1
    )

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_f32(gpu_m1, gpu_m2, gpu_res,
               np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
               grid=grid_size, block=block_size)
    ctx.synchronize()
    cuda.memcpy_dtoh(res, gpu_res)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()
Lo(res)
Grid size: (10, 4, 1)
Block size: (32, 32, 1)
Result dimensions: 100x300
Total threads: 40960
array[100, 300] f32 n=30000 (0.1Mb) x∈[-63.416, 56.938] μ=-0.039 σ=14.187
m1[0,1], m2[1,0]
(np.float32(0.24507281), np.float32(-1.3683434))
np.isclose(res, np_res).all()
np.False_
Lo(np.isclose(res, np_res)).chans
Looks like matmul is very susceptible to numerical instability.
Since we are adding numbers to the accumulator over and over, once the accumulated value gets large enough it loses the precision needed to correctly pick up small values. If it then gets a large update of the opposite sign and becomes small again, those rounding errors become very significant.
But I think overall it's correct. I changed the accumulator to double and am still seeing the discrepancy. It's possible that numpy's matmul is also not perfectly precise, or simply accumulates in a different order, so exact equality can't be expected.
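A quick way to see the accumulation effect (a small sketch, not part of the original run; row and col are just illustrative indices) is to compute one output element with a float32 accumulator and a float64 accumulator and compare:

# Accumulate one dot product in float32 vs float64 to show the rounding drift.
row, col = 0, 0
acc32 = np.float32(0.0)
acc64 = np.float64(0.0)
for i in range(m1.shape[1]):
    prod = m1[row, i] * m2[i, col]   # float32 product, as in the kernel
    acc32 += prod                    # float32 accumulator
    acc64 += np.float64(prod)        # float64 accumulator
print(acc32, acc64, abs(np.float64(acc32) - acc64))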
Let’s do the exercises
- In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.
- Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.
- Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.
- Analyze the pros and cons of each of the two kernel designs.
// day_04_matmul_row_col.cu
#include <stdint.h>
#include <stdio.h>

// One thread per output element (same as the naive kernel above).
__global__ void matmul_f32(float* m1, float* m2, float* res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y * m1_width + i] * m2[i * m2_width + x];
        }
        res[y * out_shape_1 + x] = out;
    }
}
// One thread per output row.
__global__ void matmul_f32_row(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    if (y < out_shape_0) {
        for (int x = 0; x < out_shape_1; x++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}
// One thread per output column.
__global__ void matmul_f32_col(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;
    if (x < out_shape_1) {
        for (int y = 0; y < out_shape_0; y++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}
BLOCK_SIZE_X = 1
BLOCK_SIZE_Y = 32
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()

    mod = SourceModule(Path("day_04_matmul_row_col.cu").read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])
    matmul_f32_row = mod.get_function("matmul_f32_row")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_row = np.empty(out_shape, dtype=np.float32)
    gpu_res_row = cuda.mem_alloc_like(res_row)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (1, (out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y, 1)

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_f32_row(gpu_m1, gpu_m2, gpu_res_row,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
                   grid=grid_size, block=block_size)
    # ctx.synchronize()
    cuda.memcpy_dtoh(res_row, gpu_res_row)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()
Lo(res_row)
Grid size: (1, 4, 1)
Block size: (1, 32, 1)
Result dimensions: 100x300
Total threads: 128
array[100, 300] f32 n=30000 (0.1Mb) x∈[-63.416, 56.938] μ=-0.039 σ=14.187
Lo(res)
array[100, 300] f32 n=30000 (0.1Mb) x∈[-63.416, 56.938] μ=-0.039 σ=14.187
(res == res_row).all()
np.True_
Looks good, let’s try with the column.
BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 1
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()

    mod = SourceModule(Path("day_04_matmul_row_col.cu").read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])
    matmul_f32_col = mod.get_function("matmul_f32_col")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_col = np.empty(out_shape, dtype=np.float32)
    gpu_res_col = cuda.mem_alloc_like(res_col)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = ((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X, 1, 1)

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_f32_col(gpu_m1, gpu_m2, gpu_res_col,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
                   grid=grid_size, block=block_size)
    ctx.synchronize()
    cuda.memcpy_dtoh(res_col, gpu_res_col)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()
Lo(res_col)
Grid size: (10, 1, 1)
Block size: (32, 1, 1)
Result dimensions: 100x300
Total threads: 320
array[100, 300] f32 n=30000 (0.1Mb) x∈[-63.416, 56.938] μ=-0.039 σ=14.187
(res_col == res).all()
np.True_
- Analyze the pros and cons of each of the two kernel designs.
Both of them suck, mainly because neither launch uses enough threads to saturate the GPU: the row kernel runs 128 threads and the column kernel 320, versus 40960 for the per-element kernel. The row version possibly sucks a bit less because all of its threads walk m2 in the same order, so those loads tend to hit in cache that is shared between thread blocks. A quick way to check would be to time all three kernels, as sketched below.
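A minimal timing sketch with CUDA events (not part of the original run; it assumes a context is active and that matmul_f32, matmul_f32_row, matmul_f32_col and the GPU buffers are set up as in the cells above):

def time_kernel(fn, args, grid, block, n_iter=20):
    # Average milliseconds per launch, measured with CUDA events.
    start, end = cuda.Event(), cuda.Event()
    start.record()
    for _ in range(n_iter):
        fn(*args, grid=grid, block=block)
    end.record()
    end.synchronize()
    return start.time_till(end) / n_iter

args = (gpu_m1, gpu_m2, gpu_res,
        np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]))
print("per-element:", time_kernel(matmul_f32,     args, (10, 4, 1), (32, 32, 1)))
print("per-row:    ", time_kernel(matmul_f32_row, args, (1, 4, 1),  (1, 32, 1)))
print("per-column: ", time_kernel(matmul_f32_col, args, (10, 1, 1), (32, 1, 1)))

With a 100x300 output the numbers would mostly reflect launch overhead; larger matrices would make the occupancy gap between the three designs more visible.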