Day 4 - Naive matmul+exercises

import numpy as np
from PIL import Image
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
cuda.init()

device = cuda.Device(0)

print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
from pathlib import Path
cu_file = "kernels/matmul/matmul.cu"

Naive matmul kernels/matmul/matmul.cu

#include <stdint.h>
#include <stdio.h>

__global__ void matmul_f32(float *m1, float *m2, float *res,
    uint32_t out_shape_0,
    uint32_t out_shape_1,
    uint32_t inner_dim) {

    // One thread per output element: x is the output column, y is the output row.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    // Accumulate in double to reduce rounding error; the result is stored back as float.
    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y*m1_width + i] * m2[i*m2_width + x];
        }
        res[y*out_shape_1 + x] = out;
    }
}

from lovely_numpy import Lo
m1 = np.random.randn(100, 200).astype(np.float32)
m2 = np.random.randn(200, 300).astype(np.float32)

np_res = np.matmul(m1, m2)
Lo(np_res)
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150

Testing naive matmul

BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 32

assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])

out_shape = (m1.shape[0], m2.shape[1])

try:
    ctx = device.make_context()

    mod = SourceModule(Path(cu_file).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow'
        ]
        )

    matmul_f32 = mod.get_function("matmul_f32")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res = np.empty(out_shape, dtype=np.float32)

    gpu_res = cuda.mem_alloc_like(res)


    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (
        ((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X),
        ((out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y),
        1
    )

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Restul dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()

    matmul_f32(gpu_m1, gpu_m2, gpu_res, np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]), grid=grid_size, block=block_size)

    ctx.synchronize()

    cuda.memcpy_dtoh(res, gpu_res)
    ctx.synchronize()


finally:
    ctx.pop()
    ctx.detach()

Lo(res)
Grid size: (10, 4, 1)
Block size: (32, 32, 1)
Result dimensions: 100x300
Total threads: 40960
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
m1[0,1],m2[1,0]
(np.float32(-0.9388011), np.float32(0.16297509))
np.isclose(res, np_res).all()
np.False_
Lo(np.isclose(res, np_res)).chans

Numeric stability

Looks like matmul is very susceptible to numerical instability. Since we are adding numbers to the accumulator over and over, once the accumulated value gets large enough it loses the precision needed to correctly accumulate small values. If it then gets a large update in the opposite direction and becomes small again, those errors become very significant.

But I think the kernel is correct overall. I changed the accumulator to double and am still seeing the discrepancy, so it's possible that NumPy's matmul is also not very precise.

Exercises

Let’s do the exercises

  1. In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.
      a. Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.
      b. Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.
      c. Analyze the pros and cons of each of the two kernel designs.
cu_file2 = "kernels/matmul/matmul-row_col.cu"

One thread per row/col:

Naive matmul row/col kernels/matmul/matmul-row_col.cu

#include <stdint.h>
#include <stdio.h>

__global__ void matmul_f32(float* m1, float* m2, float* res,
                           uint32_t out_shape_0,
                           uint32_t out_shape_1,
                           uint32_t inner_dim)
{

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    double out;
    if (x < out_shape_1 && y < out_shape_0) {
        out = 0;
        for (int i = 0; i < inner_dim; i++) {
            out += m1[y * m1_width + i] * m2[i * m2_width + x];
        }
        res[y * out_shape_1 + x] = out;
    }
}

__global__ void matmul_f32_row(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    // One thread per output row: this thread's y is the row; the x loop walks every column.
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    if (y < out_shape_0) {
        for (int x = 0; x < out_shape_1; x++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }

}

__global__ void matmul_f32_col(float* m1, float* m2, float* res,
                               uint32_t out_shape_0,
                               uint32_t out_shape_1,
                               uint32_t inner_dim)
{
    // One thread per output column: this thread's x is the column; the y loop walks every row.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int m1_width = inner_dim;
    int m2_width = out_shape_1;

    if (x < out_shape_1) {
        for (int y = 0; y < out_shape_0; y++) {
            double out = 0;
            for (int i = 0; i < inner_dim; i++) {
                out += m1[y * m1_width + i] * m2[i * m2_width + x];
            }
            res[y * out_shape_1 + x] = out;
        }
    }
}

Testing the thread per row matmul

BLOCK_SIZE_X = 1
BLOCK_SIZE_Y = 32

assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])

out_shape = (m1.shape[0], m2.shape[1])

try:
    ctx = device.make_context()

    mod = SourceModule(Path(cu_file2).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow'
        ]
        )

    matmul_f32_row = mod.get_function("matmul_f32_row")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_row = np.empty(out_shape, dtype=np.float32)

    gpu_res_row = cuda.mem_alloc_like(res_row)


    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (1, ((out_shape[0] + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y), 1)

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Restul dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()

    matmul_f32_row(gpu_m1, gpu_m2, gpu_res_row, np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]), grid=grid_size, block=block_size)
    # ctx.synchronize()

    cuda.memcpy_dtoh(res_row, gpu_res_row)
    ctx.synchronize()


finally:
    ctx.pop()
    ctx.detach()

Lo(res_row)
Grid size: (1, 4, 1)
Block size: (1, 32, 1)
Result dimensions: 100x300
Total threads: 128
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
Lo(res)
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
(res == res_row).all()
np.True_

Testing one thread per col matmul

BLOCK_SIZE_X = 32
BLOCK_SIZE_Y = 1

assert(len(m1.shape) == 2)
assert(len(m2.shape) == 2)
assert(m1.shape[1] == m2.shape[0])

out_shape = (m1.shape[0], m2.shape[1])

try:
    ctx = device.make_context()

    mod = SourceModule(Path(cu_file2).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow'
        ]
        )

    matmul_f32_col = mod.get_function("matmul_f32_col")

    gpu_m1 = cuda.mem_alloc_like(m1)
    gpu_m2 = cuda.mem_alloc_like(m2)

    res_col = np.empty(out_shape, dtype=np.float32)

    gpu_res_col = cuda.mem_alloc_like(res_col)


    cuda.memcpy_htod(gpu_m1, m1)
    cuda.memcpy_htod(gpu_m2, m2)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    grid_size = (((out_shape[1] + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X),1,1)

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Restul dimensions: {out_shape[0]}x{out_shape[1]}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    ctx.synchronize()

    matmul_f32_col(gpu_m1, gpu_m2, gpu_res_col, np.uint32(out_shape[0]), np.uint32(out_shape[1]), np.uint32(m1.shape[1]), grid=grid_size, block=block_size)
    ctx.synchronize()

    cuda.memcpy_dtoh(res_col, gpu_res_col)
    ctx.synchronize()


finally:
    ctx.pop()
    ctx.detach()

Lo(res_col)
Grid size: (10, 1, 1)
Block size: (32, 1, 1)
Result dimensions: 100x300
Total threads: 320
array[100, 300] f32 n=30000 (0.1Mb) x∈[-57.284, 65.644] μ=-0.102 σ=14.150
(res_col == res).all()
np.True_
    c. Analyze the pros and cons of each of the two kernel designs.
They both suck, mostly because we are not launching enough threads to saturate the GPU.
The row version possibly sucks a bit less because the cache is shared between thread blocks.