Day 0 - playing with PyCUDA

import pycuda.driver as cuda
cuda.init()
Let’s see what kind of GPUs we got
MiB = 1024*1024
print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
for i in range(cuda.Device.count()):
    device = cuda.Device(i)
    attrs = device.get_attributes()
    # mem_get_info() needs an active context on the device.
    context = device.make_context()
    free_bytes, total_bytes = cuda.mem_get_info()
    used_bytes = total_bytes - free_bytes
    context.pop()
    context.detach()
    print(
        f"Device {i}:\t{device.name()}\n"
        f"\t\tCompute capability: {'.'.join(str(v) for v in device.compute_capability())}\n"
        f"\t\tVRAM used: {used_bytes // MiB}MiB / {total_bytes // MiB}MiB\n"
    )

Cuda version: 12.8.0
Device 0:    NVIDIA GeForce RTX 3080 Laptop GPU
        Compute capability: 8.6
        VRAM used: 2129MiB / 15983MiB
!nvidia-smi | head -n 12

Tue Mar 25 19:29:27 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01 Driver Version: 565.57.01 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 3080 ... Off | 00000000:01:00.0 On | N/A |
| N/A 57C P0 30W / 115W | 1966MiB / 16384MiB | 7% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
For some reason we get slightly less total VRAM than nvidia-smi shows (15983MiB vs 16384MiB). I guess it's memory reserved for CUDA stuff.
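Not in the original notebook: to put numbers on that gap, we can query nvidia-smi programmatically and compare it against cuda.mem_get_info(). A minimal sketch, assuming nvidia-smi is on PATH and we only care about device 0:

import subprocess

ctx = cuda.Device(0).make_context()
try:
    free_b, total_b = cuda.mem_get_info()  # bytes, as the CUDA driver sees them
finally:
    ctx.pop()
    ctx.detach()

# --query-gpu gives machine-readable output; values are in MiB with nounits.
smi = subprocess.check_output(
    ["nvidia-smi", "--query-gpu=memory.total,memory.used",
     "--format=csv,noheader,nounits"], text=True
)
smi_total_mib, smi_used_mib = [int(x) for x in smi.splitlines()[0].split(",")]
print(f"CUDA driver total: {total_b // MiB}MiB, nvidia-smi total: {smi_total_mib}MiB")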
Let’s move some data in and out of the GPU.
import numpy as np

device = cuda.Device(0)
# Make the context outside the try block: if make_context() itself failed
# inside it, the finally clause would hit an undefined ctx.
ctx = device.make_context()
try:
    cpu_array = np.random.randn(1024, 1024).astype(np.float32)
    gpu_array = cuda.mem_alloc_like(cpu_array)
    cuda.memcpy_htod(gpu_array, cpu_array)
    cpu_array_2 = np.empty_like(cpu_array, dtype=np.float32)
    cuda.memcpy_dtoh(cpu_array_2, gpu_array)
finally:
    ctx.pop()
    ctx.detach()

(cpu_array == cpu_array_2).all()

np.True_
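Side note, not from the original: pycuda.gpuarray wraps the same allocate/copy round trip in a NumPy-like interface, so the cell above shrinks to a couple of lines. A sketch with the same context handling:

from pycuda import gpuarray

ctx = device.make_context()
try:
    cpu_array = np.random.randn(1024, 1024).astype(np.float32)
    g = gpuarray.to_gpu(cpu_array)  # allocates device memory and copies htod
    back = g.get()                  # copies dtoh into a fresh numpy array
    assert (back == cpu_array).all()
finally:
    ctx.pop()
    ctx.detach()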
The memcpy round trip looks ok. Let's try actually doing something with the data on the GPU.
from pycuda.compiler import SourceModule

ctx = device.make_context()
try:
    # Slightly expanded code from the PyCUDA tutorial.
    mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        // Flat index of this thread: row y times the grid's total width
        // (gridDim.x * blockDim.x), plus column x.
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        int idx = y * blockDim.x * gridDim.x + x;
        a[idx] *= 2;
    }
    """)
    doublify = mod.get_function("doublify")
    # For a 1024x1024 array, we use a 32x32 grid of 32x32 blocks.
    block_size = (32, 32, 1)
    grid_size = (32, 32, 1)
    cpu_array = np.random.randn(1024, 1024).astype(np.float32)
    gpu_array = cuda.mem_alloc_like(cpu_array)
    cpu_array_2 = np.empty_like(cpu_array, dtype=np.float32)
    cuda.memcpy_htod(gpu_array, cpu_array)
    doublify(gpu_array, block=block_size, grid=grid_size)
    cuda.memcpy_dtoh(cpu_array_2, gpu_array)
finally:
    ctx.pop()
    ctx.detach()

(cpu_array_2 == (cpu_array * 2)).all()

np.True_
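Bonus, not in the original notebook: PyCUDA's argument handlers can hide the explicit allocation and both copies. cuda.InOut copies the numpy array to the device before the launch and copies the result back over the original afterwards. A sketch reusing the same kernel:

ctx = device.make_context()
try:
    mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        int idx = y * blockDim.x * gridDim.x + x;
        a[idx] *= 2;
    }
    """)
    doublify = mod.get_function("doublify")
    a = np.random.randn(1024, 1024).astype(np.float32)
    expected = a * 2
    # InOut handles the htod copy, the device allocation, and the dtoh copy.
    doublify(cuda.InOut(a), block=(32, 32, 1), grid=(32, 32, 1))
    assert (a == expected).all()
finally:
    ctx.pop()
    ctx.detach()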