Day 4 - Naive matmul + exercises

```python
import numpy as np
from PIL import Image

import pycuda.driver as cuda
from pycuda.compiler import SourceModule

cuda.init()
device = cuda.Device(0)
print(f"Cuda version: {'.'.join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")
```

```
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
```

```python
from pathlib import Path
cu_file = "kernels/matmul/matmul.cu"
```
Naive matmul (`kernels/matmul/matmul.cu`):

```cuda
#include <stdint.h>
#include <stdio.h>

// Naive matmul: one thread per output element.
// m1 is (out_shape_0 x inner_dim), m2 is (inner_dim x out_shape_1),
// res is (out_shape_0 x out_shape_1), all row-major.
__global__ void matmul_f32(float *m1, float *m2, float *res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
    int y = blockIdx.y * blockDim.y + threadIdx.y;  // output row

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    double out;  // accumulate in double to reduce rounding error
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y*m1_width + i] * m2[i*m2_width + x];
        }
        res[y*out_shape_1 + x] = out;
    }
}
```
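To make the index arithmetic concrete, here is a slow CPU sketch of the same computation with the same flat row-major indexing; each GPU thread corresponds to one `(y, x)` pair, and `acc` plays the role of the `double out` accumulator. This is only an illustration, it is not used anywhere below.

```python
# CPU sketch of what each (x, y) GPU thread computes, using the kernel's
# flat row-major indexing (illustration only, not used below).
def matmul_f32_cpu(m1, m2):
    out_shape_0, inner_dim = m1.shape          # output rows, shared dimension
    out_shape_1 = m2.shape[1]                  # output columns
    a, b = m1.ravel(), m2.ravel()              # flat buffers, like the device pointers
    res = np.empty(out_shape_0 * out_shape_1, dtype=np.float32)
    for y in range(out_shape_0):               # one GPU thread handles one (y, x) pair
        for x in range(out_shape_1):
            acc = np.float64(0.0)              # double accumulator, like `double out`
            for i in range(inner_dim):
                acc += a[y * inner_dim + i] * b[i * out_shape_1 + x]
            res[y * out_shape_1 + x] = acc
    return res.reshape(out_shape_0, out_shape_1)
```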
```python
from lovely_numpy import Lo

m1 = np.random.randn(100, 200).astype(np.float32)
m2 = np.random.randn(200, 300).astype(np.float32)

np_res = np.matmul(m1, m2)
Lo(np_res)
```

```
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
```
Testing the naive matmul
```python
BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 32

assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])

try:
    ctx = device.make_context()

    mod = SourceModule(Path(cu_file).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow'
        ]
    )
    matmul_f32 = mod.get_function("matmul_f32")

    # Allocate device buffers and upload the inputs.
    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res = np.empty(out_shape, dtype=np.float32)
    gpu_res = cuda.mem_alloc_like(res)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    # One thread per output element: x covers columns, y covers rows.
    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (
        ((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X),
        ((out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y),
        1
    )

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_f32(gpu_m1, gpu_m2, gpu_res,
               np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
               grid=grid_size, block=block_size)
    ctx.synchronize()

    cuda.memcpy_dtoh(res, gpu_res)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()

Lo(res)
```

```
Grid size: (10, 4, 1)
Block size: (32, 32, 1)
Result dimensions: 100x300
Total threads: 40960
```

```
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
```
```python
m1[0,1], m2[1,0]
```

```
(np.float32(-0.9388011), np.float32(0.16297509))
```

```python
np.isclose(res, np_res).all()
```

```
np.False_
```

```python
Lo(np.isclose(res, np_res)).chans
```
Numeric stability
Looks like matmul is quite susceptible to numerical error:

> Since we are adding numbers to the accumulator over and over, once the accumulated value gets large enough it loses the precision needed to correctly accumulate small values. If it then gets a large update of the opposite sign and becomes small again, those rounding errors become very significant.

But I think the result is correct overall. I changed the accumulator to double and still see the discrepancy, so it's possible that numpy's matmul is not perfectly precise either: for float32 inputs it likely accumulates in single precision (BLAS sgemm) and sums the products in a different order than the kernel does.
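One way to see which side is actually closer would be to compare both results against a float64 reference, and to use a tolerance appropriate for float32 (`np.isclose` defaults to `rtol=1e-05, atol=1e-08`, which is tight for a 200-term float32 dot product). A quick sketch, not part of the run above:

```python
# Sketch: compare both results against a float64 reference (not run above).
ref = np.matmul(m1.astype(np.float64), m2.astype(np.float64))

print("max |gpu   - ref|:", np.abs(res - ref).max())
print("max |numpy - ref|:", np.abs(np_res - ref).max())

# With a float32-sized tolerance the two float32 results should agree.
print(np.allclose(res, np_res, rtol=1e-4, atol=1e-4))
```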
Exercises
Let's do the exercises.

- In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.
  - Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.
  - Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.
  - Analyze the pros and cons of each of the two kernel designs.
= "kernels/matmul/matmul-row_col.cu" cu_file2
One thread per row/col:
Naive matmul row/col kernels/matmul/matmul-row_col.cu
```cuda
#include <stdint.h>
#include <stdio.h>

// One thread per output element (same as the naive kernel above).
__global__ void matmul_f32(float* m1, float* m2, float* res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y * m1_width + i] * m2[i * m2_width + x];
        }
        res[y * out_shape_1 + x] = out;
    }
}

// One thread per output row: thread y loops over all columns of its row.
__global__ void matmul_f32_row(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    if (y < out_shape_0) {
        for (int x = 0; x < out_shape_1; x++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}

// One thread per output column: thread x loops over all rows of its column.
__global__ void matmul_f32_col(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    if (x < out_shape_1) {
        for (int y = 0; y < out_shape_0; y++) {  // iterate over rows (out_shape_0)
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}
```
Testing the one-thread-per-row matmul
```python
BLOCK_SIZE_X = 1
BLOCK_SIZE_Y = 32

assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])

try:
    ctx = device.make_context()

    mod = SourceModule(Path(cu_file2).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow'
        ]
    )
    matmul_f32_row = mod.get_function("matmul_f32_row")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_row = np.empty(out_shape, dtype=np.float32)
    gpu_res_row = cuda.mem_alloc_like(res_row)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    # One thread per output row: the grid only covers the y (row) dimension.
    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (1, ((out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y), 1)

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_f32_row(gpu_m1, gpu_m2, gpu_res_row,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
                   grid=grid_size, block=block_size)
    # ctx.synchronize()
    cuda.memcpy_dtoh(res_row, gpu_res_row)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()

Lo(res_row)
```

```
Grid size: (1, 4, 1)
Block size: (1, 32, 1)
Result dimensions: 100x300
Total threads: 128
```

```
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
```
```python
Lo(res)
```

```
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
```

```python
(res == res_row).all()
```

```
np.True_
```
Testing the one-thread-per-column matmul
```python
BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 1

assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])
out_shape = (m1.shape[0], m2.shape[1])

try:
    ctx = device.make_context()

    mod = SourceModule(Path(cu_file2).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow'
        ]
    )
    matmul_f32_col = mod.get_function("matmul_f32_col")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_col = np.empty(out_shape, dtype=np.float32)
    gpu_res_col = cuda.mem_alloc_like(res_col)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    # One thread per output column: the grid only covers the x (column) dimension.
    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X), 1, 1)

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Result dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_f32_col(gpu_m1, gpu_m2, gpu_res_col,
                   np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
                   grid=grid_size, block=block_size)
    ctx.synchronize()

    cuda.memcpy_dtoh(res_col, gpu_res_col)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()

Lo(res_col)
```

```
Grid size: (10, 1, 1)
Block size: (32, 1, 1)
Result dimensions: 100x300
Total threads: 320
```

```
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
```
```python
(res_col == res).all()
```

```
np.True_
```
- Analyze the pros and cons of each of the two kernel designs.

They both suck, mostly because neither launches enough threads to saturate the GPU (128 and 320 threads respectively, versus 40960 for the per-element kernel). The row version possibly sucks a bit less: each thread walks its row of m1 sequentially, and at every inner-loop step all threads in the block read the same m2 element, which the cache handles well. On the other hand, the column version gets coalesced reads of m2 and coalesced writes of the output, so it is hard to say which one wins without measuring.
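To actually settle that, one would profile. Below is a rough sketch of how the three kernels could be timed with CUDA events; it assumes it runs inside an active context like the cells above, with all three functions fetched from the module compiled from `cu_file2` and the device buffers already uploaded. I haven't run this, so no numbers are claimed here.

```python
# Rough timing sketch (assumes an active context, the module from cu_file2,
# and gpu_m1/gpu_m2/gpu_res already allocated and uploaded as above).
def time_kernel(fn, args, grid, block, iters=20):
    start, end = cuda.Event(), cuda.Event()
    fn(*args, grid=grid, block=block)      # warm-up launch
    start.record()
    for _ in range(iters):
        fn(*args, grid=grid, block=block)
    end.record()
    end.synchronize()
    return start.time_till(end) / iters    # average milliseconds per launch

args = (gpu_m1, gpu_m2, gpu_res,
        np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]))

print("per element:", time_kernel(matmul_f32,     args, (10, 4, 1), (32, 32, 1)))
print("per row:    ", time_kernel(matmul_f32_row, args, (1, 4, 1),  (1, 32, 1)))
print("per column: ", time_kernel(matmul_f32_col, args, (10, 1, 1), (32, 1, 1)))
```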