import numpy as np
from PIL import Image
Day 3 - RGB blur
# Load the source picture from disk; the path is relative to the notebook's
# working directory.
image = Image.open("../cat-1.jpg")
# Bare expression: presumably left as the last line of a notebook cell so the
# image is rendered as the cell output.
image
# Convert the PIL image to a numpy array so its raw bytes can be copied to
# the GPU.
img_array = np.array(image)
print(f"Image shape: {img_array.shape}")

# The first two axes of the array are (rows, columns), i.e. (height, width).
height, width = img_array.shape[:2]
Image shape: (600, 451, 3)
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
# Initialise the CUDA driver API and select the first GPU.
cuda.init()
device = cuda.Device(0)

# Single quotes inside the replacement field keep this f-string valid on
# Python versions older than 3.12 (which reject reusing the outer quote).
print(f"Cuda version: {'.'.join(str(i) for i in cuda.get_version())}")
print(f"Device:\t{device.name()}")
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
from pathlib import Path

# Location of the CUDA source that holds the rgb_blur kernel, relative to the
# notebook's working directory.
cu_file = "kernels/misc/rgb_blur.cu"
kernels/misc/rgb_blur.cu
#include <stdint.h>
#include <stdio.h>
// Box-blur an 8-bit image stored row-major with 3 interleaved channels per
// pixel. One thread produces one output pixel (all three channels).
//
//   in   - source pixels, w * h * 3 bytes
//   out  - destination pixels, same layout as `in`
//   w, h - image width and height in pixels
//   blur - blur radius; the window is (2*blur + 1) x (2*blur + 1) pixels
//
// Each output channel is the mean of the window samples that fall inside the
// image. Dividing by the number of samples actually read (instead of the full
// window area) keeps pixels near the border from being darkened.
__global__
void rgb_blur(uint8_t *in, uint8_t *out, uint32_t w, uint32_t h, uint32_t blur) {
    // Work in signed ints so the window offsets can go negative without
    // relying on unsigned wrap-around or mixed signed/unsigned comparisons.
    const int width = (int)w;
    const int height = (int)h;
    const int radius = (int)blur;

    const int x = (int)(blockIdx.x * blockDim.x + threadIdx.x);
    const int y = (int)(blockIdx.y * blockDim.y + threadIdx.y);

    // Threads whose coordinates fall outside the image have nothing to do.
    if (x >= width || y >= height) {
        return;
    }

    const size_t idx = (size_t)y * (size_t)width + (size_t)x;

    for (int ch = 0; ch < 3; ch++) {
        uint32_t sum = 0;
        uint32_t count = 0;

        for (int j = -radius; j <= radius; j++) {
            const int yy = y + j;
            if (yy < 0 || yy >= height) {
                continue;  // row lies outside the image
            }
            const size_t row = (size_t)yy * (size_t)width;

            for (int i = -radius; i <= radius; i++) {
                const int xx = x + i;
                if (xx < 0 || xx >= width) {
                    continue;  // column lies outside the image
                }
                sum += in[(row + (size_t)xx) * 3 + (size_t)ch];
                count++;
            }
        }

        // count >= 1: the centre pixel (i == 0, j == 0) is always in bounds.
        out[idx * 3 + (size_t)ch] = (uint8_t)(sum / count);
    }
}
Testing the kernel
BLOCK_SIZE_X = 16
BLOCK_SIZE_Y = 16
# Blur radius handed to the kernel; the window is (2*radius + 1) pixels wide.
BLUR_RADIUS = 3

# Create the context *before* entering `try`: if creation fails there is no
# context to release, and the `finally` clause must not hide the real error
# behind a NameError on `ctx`.
ctx = device.make_context()
try:
    # Compile the kernel; each '-Xcompiler' forwards the following flag to
    # the host compiler to turn on extra warnings.
    mod = SourceModule(
        Path(cu_file).read_text(),
        options=[
            '-Xcompiler', '-Wall',
            '-Xcompiler', '-Wextra',
            '-Xcompiler', '-Wsign-conversion',
            '-Xcompiler', '-Wcast-qual',
            '-Xcompiler', '-Wunused-parameter',
            '-Xcompiler', '-Wdouble-promotion',
            '-Xcompiler', '-Wformat=2',
            '-Xcompiler', '-Wfloat-equal',
            '-Xcompiler', '-Wshadow',
        ],
    )
    rgb_blur = mod.get_function("rgb_blur")

    # Device buffers sized like the host image, one for input and one for
    # the blurred result.
    gpu_in = cuda.mem_alloc_like(img_array)
    gpu_out = cuda.mem_alloc_like(img_array)

    cuda.memcpy_htod(gpu_in, img_array)

    block_size = (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1)
    # Ceiling division so partially filled blocks still cover the right and
    # bottom edges of the image.
    grid_size = (
        (width + BLOCK_SIZE_X - 1) // BLOCK_SIZE_X,
        (height + BLOCK_SIZE_Y - 1) // BLOCK_SIZE_Y,
        1,
    )

    print(f"Grid size: {grid_size}")
    print(f"Block size: {block_size}")
    print(f"Image dimensions: {width}x{height}")
    print(f"Total threads: {grid_size[0] * grid_size[1] * block_size[0] * block_size[1]}")

    # Scalar arguments are passed as uint32 to match the kernel's
    # `uint32_t w, h, blur` parameters.
    rgb_blur(
        gpu_in,
        gpu_out,
        np.uint32(width),
        np.uint32(height),
        np.uint32(BLUR_RADIUS),
        block=block_size,
        grid=grid_size,
    )

    # Copy the blurred pixels back into a host array shaped like the input.
    cpu_out = np.empty_like(img_array)
    cuda.memcpy_dtoh(cpu_out, gpu_out)
finally:
    # Always release the context, even when compilation or the launch fails.
    ctx.pop()
    ctx.detach()
Grid size: (29, 38, 1)
Block size: (16, 16, 1)
Image dimensions: 451x600
Total threads: 282112
# Wrap the array copied back from the GPU in a PIL image. Bare expression:
# presumably the last line of a notebook cell so the result is displayed.
Image.fromarray(cpu_out)