# Day 0 - playing with PyCUDA

Let's see what kind of GPUs we got.

```python
import pycuda.driver as cuda

cuda.init()

MiB = 1024 * 1024

print(f"Cuda version: {'.'.join(str(v) for v in cuda.get_version())}")
for i in range(cuda.Device.count()):
    device = cuda.Device(i)
    attrs = device.get_attributes()
    context = device.make_context()
    free_bytes, total_bytes = cuda.mem_get_info()
    used_bytes = total_bytes - free_bytes
    context.pop()
    context.detach()
    print(
        f"Device {i}:\t{device.name()}\n"
        f"\t\tCompute capability: {'.'.join(str(v) for v in device.compute_capability())}\n"
        f"\t\tVRAM used: {used_bytes // MiB}MiB / {total_bytes // MiB}MiB\n"
    )
```
```
Cuda version: 12.8.0
Device 0:   NVIDIA GeForce RTX 3080 Laptop GPU
            Compute capability: 8.6
            VRAM used: 1913MiB / 15983MiB
```
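A quick aside of my own: the `attrs` dict we fetched above is never actually used. It maps `pycuda.driver.device_attribute` enums to values, so you can poke at the hardware a bit. A minimal sketch (the attribute names are standard CUDA device attributes):

```python
# Aside: inspect a few entries of the device_attribute -> value dict.
import pycuda.driver as cuda

cuda.init()
attrs = cuda.Device(0).get_attributes()
print(f"SMs:                   {attrs[cuda.device_attribute.MULTIPROCESSOR_COUNT]}")
print(f"Max threads per block: {attrs[cuda.device_attribute.MAX_THREADS_PER_BLOCK]}")
print(f"Warp size:             {attrs[cuda.device_attribute.WARP_SIZE]}")
```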
```python
!nvidia-smi | head -n 12
```

```
Sun Feb  9 03:05:54 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01              Driver Version: 565.57.01      CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 3080 ...    Off |   00000000:01:00.0  On |                  N/A |
| N/A   59C    P0             29W / 115W  |    1757MiB /  16384MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
```
For some reason we get slightly less VRAM than what's shown by nvidia-smi: `mem_get_info` reports 15983 MiB total vs nvidia-smi's 16384 MiB, so about 401 MiB is unaccounted for. I guess it's the memory reserved for CUDA stuff.
Let’s move some data in and out of the GPU.
```python
import numpy as np

device = cuda.Device(0)
ctx = device.make_context()  # create before try, so finally can't hit an undefined ctx
try:
    cpu_array = np.random.randn(1024, 1024).astype(np.float32)
    gpu_array = cuda.mem_alloc_like(cpu_array)
    cuda.memcpy_htod(gpu_array, cpu_array)

    cpu_array_2 = np.empty_like(cpu_array, dtype=np.float32)
    cuda.memcpy_dtoh(cpu_array_2, gpu_array)
finally:
    ctx.pop()
    ctx.detach()

(cpu_array == cpu_array_2).all()
```
```
np.True_
```
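As an aside, PyCUDA's `gpuarray` module wraps the allocation and both copies, so the same round trip shrinks to two calls. A minimal sketch, assuming a context is already active (e.g. via `import pycuda.autoinit`):

```python
import numpy as np
import pycuda.gpuarray as gpuarray

# Same host -> device -> host round trip via pycuda.gpuarray.
cpu_array = np.random.randn(1024, 1024).astype(np.float32)
gpu_array = gpuarray.to_gpu(cpu_array)  # alloc + host->device copy
cpu_array_2 = gpu_array.get()           # device->host copy into a fresh array
assert (cpu_array == cpu_array_2).all()
```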
The round trip preserves the data. Now let's try doing something with it while it's on the GPU.
```python
from pycuda.compiler import SourceModule

ctx = device.make_context()
try:
    # Slightly expanded code from their tutorial.
    mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        int idx = y * blockDim.x * gridDim.x + x;
        a[idx] *= 2;
    }
    """)
    doublify = mod.get_function("doublify")

    # For a 1024x1024 array, we use a 32x32 grid of 32x32 blocks
    # (32 blocks x 32 threads = 1024 in each dimension).
    block_size = (32, 32, 1)
    grid_size = (32, 32, 1)

    cpu_array = np.random.randn(1024, 1024).astype(np.float32)
    gpu_array = cuda.mem_alloc_like(cpu_array)
    cpu_array_2 = np.empty_like(cpu_array, dtype=np.float32)

    cuda.memcpy_htod(gpu_array, cpu_array)
    doublify(gpu_array, block=block_size, grid=grid_size)
    cuda.memcpy_dtoh(cpu_array_2, gpu_array)
finally:
    ctx.pop()
    ctx.detach()

(cpu_array_2 == (cpu_array * 2)).all()
```
```
np.True_
```
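One more convenience worth noting: `pycuda.driver.In`/`Out`/`InOut` argument handlers make the launch itself do the copies, as in PyCUDA's own tutorial. A minimal sketch reusing `doublify`, `block_size`, and `grid_size` from above (assumes an active context):

```python
# Sketch: cuda.InOut allocates device memory, copies the array in before the
# launch, and copies the result back into the same numpy array afterwards.
a = np.random.randn(1024, 1024).astype(np.float32)
expected = a * 2                      # compute before the in-place doubling
doublify(cuda.InOut(a), block=block_size, grid=grid_size)
assert (a == expected).all()
```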