import numpy as np
from PIL import Image
Day 6 - Tiled matmul
Let’s start with square matrices whose dimensions are a multiple of the block width.
TODO: Check the performance penalty of the boundary check. It might be better to just force the matrices to be a multiple of the block size with padding.
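If padding wins, the idea would be something like this sketch (pad_to_multiple is just a hypothetical helper for illustration, using the np imported above; not used anywhere below):
def pad_to_multiple(m, block=16):
    # Zero-pad a 2D array so both dims become multiples of `block`.
    pad_rows = (-m.shape[0]) % block
    pad_cols = (-m.shape[1]) % block
    return np.pad(m, ((0, pad_rows), (0, pad_cols)))
# Run the kernel on pad_to_multiple(m1) and pad_to_multiple(m2) with no boundary
# checks, then slice the result back down to (m1.shape[0], m2.shape[1]).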
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
cuda.init()
device = cuda.Device(0)
print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
from pathlib import Path
#include <stdint.h>
#include <stdio.h>

// We will use square blocks to keep things sane.
#define TILE_WIDTH 16

__global__ void matmul_fp32_tiled(float* m1, float* m2, float* res,
                                  uint32_t out_shape_0,
                                  uint32_t out_shape_1,
                                  uint32_t inner_dim) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    __shared__ float m1_tile[TILE_WIDTH][TILE_WIDTH];
    __shared__ float m2_tile[TILE_WIDTH][TILE_WIDTH];

    int m1_x = inner_dim;    // width (row stride) of m1
    int m1_y = out_shape_0;  // height of m1 (unused)
    int m2_x = out_shape_1;  // width (row stride) of m2
    int m2_y = inner_dim;    // height of m2 (unused)

    // Assume the matrices are multiples of the tile width on both dims.
    float R = 0;
    for (int tile = 0; tile < inner_dim / TILE_WIDTH; tile++) {
        // Each thread loads one element of the current m1 and m2 tiles into shared memory.
        m1_tile[threadIdx.y][threadIdx.x] = m1[y * m1_x + tile * TILE_WIDTH + threadIdx.x];
        m2_tile[threadIdx.y][threadIdx.x] = m2[(tile * TILE_WIDTH + threadIdx.y) * m2_x + x];
        __syncthreads();

        // Accumulate the partial dot product contributed by this tile.
        for (int i = 0; i < TILE_WIDTH; i++) {
            R += m1_tile[threadIdx.y][i] * m2_tile[i][threadIdx.x];
        }
        __syncthreads();
    }
    res[y * out_shape_1 + x] = R;
}
// Non-tiled version
__global__ void matmul_f32(float* m1, float* m2, float* res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    float out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y * m1_width + i] * m2[i * m2_width + x];
        }
        res[y * out_shape_1 + x] = out;
    }
}
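Before running it, here is a quick NumPy mirror of what the tile loop computes, just to convince myself the indexing makes sense. It is an illustration only (matmul_tiled_ref is a throwaway name), assuming the inner dimension is a multiple of the tile width:
def matmul_tiled_ref(a, b, tile=16):
    # Accumulate the product tile-by-tile along the inner dimension, mirroring
    # the kernel's `for tile in ...` loop (whole strips at once instead of one
    # 16x16 output block per CUDA block).
    assert a.shape[1] == b.shape[0] and a.shape[1] % tile == 0
    out = np.zeros((a.shape[0], b.shape[1]), dtype=a.dtype)
    for t in range(a.shape[1] // tile):
        out += a[:, t*tile:(t+1)*tile] @ b[t*tile:(t+1)*tile, :]
    return out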
from lovely_numpy import Lo
m1 = np.random.randn(1024, 1024).astype(np.float32)
m2 = np.random.randn(1024, 1024).astype(np.float32)

np_res = np.matmul(m1, m2)
Lo(np_res)
array[1024, 1024] f32 n=1048576 (4Mb) x∈[-161.923, 148.814] μ=-0.029 σ=31.993
BLOCK_SIZE = 16  # 16x16
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape == m2.shape) # Make them equal for now
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()

    mod = SourceModule(Path("day_06_matmul-tiled.cu").read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])

    matmul_tiled = mod.get_function("matmul_fp32_tiled")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res = np.empty(out_shape, dtype=np.float32)
    gpu_res = cuda.mem_alloc_like(res)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE, BLOCK_SIZE, 1)
    grid_size = (
        ((out_shape[1] + BLOCK_SIZE - 1) // BLOCK_SIZE),
        ((out_shape[0] + BLOCK_SIZE - 1) // BLOCK_SIZE),
        1
    )

    print(f"Matrix 1 shape: {m1.shape}")
    print(f"Matrix 2 shape: {m2.shape}")
    print(f"Result shape: {out_shape}")
    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_tiled(gpu_m1, gpu_m2, gpu_res,
                 np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
                 grid=grid_size, block=block_size)
    ctx.synchronize()

    cuda.memcpy_dtoh(res, gpu_res)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()

Lo(res)
np.isclose(res, np_res)
array([[ True, True, True, ..., True, True, True],
[ True, True, True, ..., True, True, True],
[ True, True, True, ..., True, True, True],
...,
[ True, True, True, ..., True, True, True],
[ True, False, True, ..., True, True, True],
[ True, True, True, ..., True, True, True]],
shape=(1024, 1024))
np.isclose(res, np_res).mean()
np.float64(0.9794473648071289)
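About 2% of elements fall outside the default np.isclose tolerances, presumably just because the kernel accumulates in a different order than NumPy does in float32. If I wanted to actually quantify the error, a sketch like this against a float64 reference would do (not run here):
# Sketch: compare both fp32 results against a float64 reference.
ref64 = np.matmul(m1.astype(np.float64), m2.astype(np.float64))
print("GPU tiled vs fp64:  ", np.abs(res - ref64).max())
print("NumPy fp32 vs fp64: ", np.abs(np_res - ref64).max())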
We have the same numerical error situation as with the naive matmul; let’s compare with the non-tiled result.
BLOCK_SIZE = 16  # 16x16
assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape == m2.shape) # Make them equal for now
out_shape = (m1.shape[0], m2.shape[1])
try:
    ctx = device.make_context()

    mod = SourceModule(Path("day_06_matmul-tiled.cu").read_text(),
                       options=[
                           '-Xcompiler', '-Wall',
                           '-Xcompiler', '-Wextra',
                           '-Xcompiler', '-Wsign-conversion',
                           '-Xcompiler', '-Wcast-qual',
                           '-Xcompiler', '-Wunused-parameter',
                           '-Xcompiler', '-Wdouble-promotion',
                           '-Xcompiler', '-Wformat=2',
                           '-Xcompiler', '-Wfloat-equal',
                           '-Xcompiler', '-Wshadow'
                       ])

    matmul_naive = mod.get_function("matmul_f32")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_naive = np.empty(out_shape, dtype=np.float32)
    gpu_res_naive = cuda.mem_alloc_like(res)

    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE, BLOCK_SIZE, 1)
    grid_size = (
        ((out_shape[1] + BLOCK_SIZE - 1) // BLOCK_SIZE),
        ((out_shape[0] + BLOCK_SIZE - 1) // BLOCK_SIZE),
        1
    )

    print(f"Matrix 1 shape: {m1.shape}")
    print(f"Matrix 2 shape: {m2.shape}")
    print(f"Result shape: {out_shape}")
    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()
    matmul_naive(gpu_m1, gpu_m2, gpu_res_naive,
                 np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
                 grid=grid_size, block=block_size)
    ctx.synchronize()

    cuda.memcpy_dtoh(res_naive, gpu_res_naive)
    ctx.synchronize()
finally:
    ctx.pop()
    ctx.detach()

Lo(res_naive)
Matrix 1 shape: (1024, 1024)
Matrix 2 shape: (1024, 1024)
Result shape: (1024, 1024)
Grid size: (64, 64, 1)
Block size: (16, 16, 1)
Total threads: 1048576
array[1024, 1024] f32 n=1048576 (4Mb) x∈[-157.149, 144.693] μ=-0.019 σ=31.945
(res_naive == res).all()
np.True_
Yaaay, they match. This was the first attempt, based on my memory/understanding of what I read in chapter 4, and it worked on the first try 😎
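The next obvious step would be to actually time the two kernels against each other. A rough sketch with CUDA events (not run here; it assumes the context, compiled functions and device buffers from the cells above are still alive, which they are not after ctx.detach()):
# Sketch: time both kernels with CUDA events. Assumes ctx, matmul_tiled,
# matmul_naive, gpu_m1/gpu_m2/gpu_res, grid_size and block_size are still valid.
start, end = cuda.Event(), cuda.Event()
for name, fn in [("tiled", matmul_tiled), ("naive", matmul_naive)]:
    start.record()
    fn(gpu_m1, gpu_m2, gpu_res,
       np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]),
       grid=grid_size, block=block_size)
    end.record()
    end.synchronize()
    print(f"{name}: {start.time_till(end):.3f} ms")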