Day 12 - conv2d with shared memory and halo

import time
import pandas as pd
import numpy as np
from math import prod
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import warn_options

from lovely_numpy import Lo
from lovely_tensors import monkey_patch
monkey_patch()

import torch
from torch import Tensor
from torch.nn.functional import conv2d

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
device = cuda.Device(0)
print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
= "kernels/conv2d/conv2d-z-out-shared-halo.cu" cu_file
Using double for accumulator

I’ve noticed that we get a lot less discrepancy between torch and my implementation when using a double for the accumulator. Let’s benchmark the two.
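A quick toy illustration of why the accumulator type matters (random data standing in for one output pixel's products, not measurements from the kernel): sequentially accumulating as many float32 terms as a big conv sums per output pixel drifts noticeably, while a double accumulator stays essentially exact.

# Toy illustration of accumulator precision (synthetic data, not kernel output):
# one output pixel of a 512-channel 9x9 conv sums 512 * 9 * 9 = 41472 products.
terms = np.random.randn(512 * 9 * 9).astype(np.float32)
ref = terms.astype(np.float64).sum()  # high-precision reference

acc_f32 = np.float32(0.0)
for t in terms:
    acc_f32 += t                      # rounds to float32 at every step

acc_f64 = np.float64(0.0)
for t in terms:
    acc_f64 += np.float64(t)          # double accumulator, like -DACCUM_DTYPE=double

print(abs(acc_f32 - ref))  # typically around 1e-3: enough to trip atol=1e-4 checks
print(abs(acc_f64 - ref))  # essentially 0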
tile_size = 16
# test_cases = [(512, 128, 9, 64)]

mod = SourceModule(
    Path(cu_file).read_text(),
    options=warn_options.warn_options + ["-DACCUM_DTYPE=float", f"-DTILE_SIZE={tile_size}"],
    # + ["-DDEBUG=1"],
    include_dirs=[str(Path("./kernels/conv2d/").absolute())]
)

mod_double = SourceModule(
    Path(cu_file).read_text(),
    options=warn_options.warn_options + ["-DACCUM_DTYPE=double", f"-DTILE_SIZE={tile_size}"],
    include_dirs=[str(Path("./kernels/conv2d/").absolute())]
)
benchmarks = {
    "conv2d_pad_z_out_shared_halo_float": (
        benchmark_conv2d_pad_z_out_shared_halo,
        mod.get_function("conv2d_pad_z_out_shared_halo")
    ),
    "conv2d_pad_z_out_shared_halo_double": (
        benchmark_conv2d_pad_z_out_shared_halo,
        mod_double.get_function("conv2d_pad_z_out_shared_halo")
    )
}
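benchmark_conv2d_pad_z_out_shared_halo comes from an earlier day and isn't shown here. For context, a harness like it might look roughly like the sketch below: copy inputs to the device, launch the kernel repeat times between CUDA events, and return the result plus the average time in ms. The kernel's argument order and the dynamic shared-memory size are assumptions; the real signature lives in the .cu file.

# Hypothetical sketch of the benchmark harness (the real one is from an earlier
# day; the kernel argument list and shared-memory size here are assumptions).
def benchmark_conv2d_sketch(kernel, input, filter, tile_size, repeat=5):
    ch_out, ch_in, fs, _ = filter.shape
    _, h, w = input.shape
    out = np.zeros((ch_out, h, w), dtype=np.float32)

    in_gpu, filt_gpu, out_gpu = (cuda.mem_alloc(a.nbytes) for a in (input, filter, out))
    cuda.memcpy_htod(in_gpu, input)
    cuda.memcpy_htod(filt_gpu, filter)

    # One thread per output pixel in a tile; output channels on the grid z axis.
    grid = ((w + tile_size - 1) // tile_size, (h + tile_size - 1) // tile_size, ch_out)
    block = (tile_size, tile_size, 1)
    smem = (tile_size + fs - 1) ** 2 * np.float32().nbytes  # tile + halo

    start, end = cuda.Event(), cuda.Event()
    start.record()
    for _ in range(repeat):
        kernel(in_gpu, filt_gpu, out_gpu,
               np.int32(ch_in), np.int32(h), np.int32(w), np.int32(fs),
               block=block, grid=grid, shared=smem)
    end.record()
    end.synchronize()

    cuda.memcpy_dtoh(out, out_gpu)
    return out, start.time_till(end) / repeat  # average ms per launch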
def run_benchmarks(benchmarks, test_cases):
    data = []
    for tc in tqdm(test_cases):
        ch_in, ch_out, fs, pixels = tc

        array_in = np.random.randn(ch_in, pixels, pixels).astype(np.float32)
        filter = np.random.randn(ch_out, ch_in, fs, fs).astype(np.float32)

        torch_out = conv2d(Tensor(array_in), Tensor(filter), padding="same")

        timings = {}
        for benchmark_name, (benchmark_func, kernel) in benchmarks.items():
            res, timing = benchmark_func(kernel, input=array_in, filter=filter, tile_size=tile_size, repeat=5)

            similarity = float(np.isclose(res, torch_out, atol=1e-04, rtol=1e-4).mean())
            if similarity < 0.9:
                print(f"## Mismatch for '{benchmark_name}'")
                print(f"In: {array_in.shape}")
                print(f"Out: {(ch_out, pixels, pixels)}")
                print(f"Filter: {filter.shape}")
                print(f"Similarity: {similarity}")
                # display(Lo(np.isclose(res, torch_out, atol=1e-04, rtol=1e-4)).chans(cl=False, scale=2))
                # raise Exception

            timings[benchmark_name] = timing
            # time.sleep(10)

        cuda.Context.synchronize()
        data.append({'in_ch': ch_in,
                     'out_ch': ch_out,
                     'filter_size': fs,
                     'img_size': pixels,
                     # 'kernel': kernel_name,
                     } | timings)
    return pd.DataFrame(data)
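test_cases isn't defined in this section either. Judging from the unpacking above and the commented-out example, each case is a (ch_in, ch_out, filter_size, img_size) tuple; a plausible reconstruction of the sweep might be:

# Plausible reconstruction of test_cases (the real list is defined elsewhere);
# each tuple is (ch_in, ch_out, filter_size, img_size), matching the unpacking above.
test_cases = [
    (ch_in, ch_out, fs, pixels)
    for ch_in in (3, 64, 512)
    for ch_out in (8, 128)
    for fs in (3, 9)
    for pixels in (64, 256)
]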
results = run_benchmarks(benchmarks, test_cases)
Results
def plot_results(results, sort_column: str, timing_columns: list[str]):
    # Sort by the chosen timing column
    results_sorted = results.sort_values(by=sort_column)

    # Create a plot comparing the two kernels
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Create labels for x-axis that include dimensions
    results_sorted['dimensions'] = results_sorted.apply(
        lambda row:
            f"{int(row['img_size'])}×{int(row['img_size'])}×{int(row['in_ch'])} -> " +
            f"{int(row['out_ch'])}, f:{int(row['filter_size'])}×{int(row['filter_size'])}",
        axis=1
    )

    # Melt the dataframe to get it in the right format for seaborn
    melted_results = pd.melt(
        results_sorted,
        id_vars=['in_ch', 'out_ch', 'filter_size', 'img_size', 'dimensions'],
        value_vars=timing_columns,
        var_name='kernel',
        value_name='time'
    )

    # Split the data into two halves based on timing
    midpoint = len(results_sorted) // 2
    faster_results = melted_results[melted_results['dimensions'].isin(results_sorted['dimensions'][:midpoint])]
    slower_results = melted_results[melted_results['dimensions'].isin(results_sorted['dimensions'][midpoint:])]

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 16))

    # Plot faster results in the first subplot
    sns.barplot(x='dimensions', y='time', hue='kernel', data=faster_results, ax=ax1)
    ax1.set_xlabel('')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Performance Comparison - Faster Results')
    ax1.tick_params(axis='x', rotation=90)
    ax1.legend(title='Kernel')

    # Plot slower results in the second subplot
    sns.barplot(x='dimensions', y='time', hue='kernel', data=slower_results, ax=ax2)
    ax2.set_xlabel('Input and Filter Dimensions')
    ax2.set_ylabel('Time (ms)')
    ax2.set_title('Performance Comparison - Slower Results')
    ax2.tick_params(axis='x', rotation=90)
    ax2.legend(title='Kernel')

    # Adjust layout
    plt.tight_layout()
    plt.show()
plot_results(results, "conv2d_pad_z_out_shared_halo_float",
             ["conv2d_pad_z_out_shared_halo_float", "conv2d_pad_z_out_shared_halo_double"])

# Also display the sorted results table
# results_sorted
Ok, double is slow AF. I guess I’ll stick to the float accumulator. We will compare all the kernels to date tomorrow.