Day 12 - conv2d with shared memory and halo
import time
import pandas as pd
import numpy as np
from math import prod
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import warn_options
from lovely_numpy import Lo
from lovely_tensors import monkey_patch
monkey_patch()
import torch
from torch import Tensor
from torch.nn.functional import conv2d
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
device = cuda.Device(0)
print(f"Cuda version: {".".join([str(i) for i in cuda.get_version()])}")
print(f"Device:\t{device.name()}")Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
cu_file = "kernels/conv2d/conv2d-z-out-shared-halo.cu"Using double for accumulator
I’ve noticed that we get a lot less discrepency between torch and my implementation when using a double for accumulator. Let’s benchmark the two.
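The kernel source itself isn’t reproduced in this post. To make the flags concrete, here’s a simplified sketch of the shared-memory-plus-halo idea: ACCUM_DTYPE and TILE_SIZE are the real compile-time macros, but the kernel name, signature, and indexing below are illustrative, not the actual conv2d-z-out-shared-halo.cu.

// Simplified sketch of the shared-memory + halo approach - NOT the actual
// conv2d-z-out-shared-halo.cu. ACCUM_DTYPE and TILE_SIZE are the real -D
// macros from above; the name, signature, and layout here are illustrative.
#ifndef ACCUM_DTYPE
#define ACCUM_DTYPE float
#endif
#ifndef TILE_SIZE
#define TILE_SIZE 16
#endif

// One TILE_SIZE x TILE_SIZE output patch per block; blockIdx.z picks the
// output channel ("z-out"). The shared tile is (TILE_SIZE + fs - 1)^2 floats:
// the extra fs - 1 rows/columns are the halo the filter reads past the edge.
__global__ void conv2d_sketch(const float *in, const float *flt, float *out,
                              int ch_in, int h, int w, int fs)
{
    extern __shared__ float tile[];
    const int halo = fs / 2;                    // "same" padding, odd fs
    const int tile_w = TILE_SIZE + fs - 1;
    const int ox = blockIdx.x * TILE_SIZE + threadIdx.x;
    const int oy = blockIdx.y * TILE_SIZE + threadIdx.y;
    const int oc = blockIdx.z;

    ACCUM_DTYPE acc = 0;                        // float or double via -DACCUM_DTYPE
    for (int ic = 0; ic < ch_in; ic++) {
        // Cooperatively stage this channel's tile + halo, zero-padding OOB pixels.
        for (int y = threadIdx.y; y < tile_w; y += TILE_SIZE)
            for (int x = threadIdx.x; x < tile_w; x += TILE_SIZE) {
                int iy = blockIdx.y * TILE_SIZE + y - halo;
                int ix = blockIdx.x * TILE_SIZE + x - halo;
                tile[y * tile_w + x] = (iy >= 0 && iy < h && ix >= 0 && ix < w)
                                     ? in[(ic * h + iy) * w + ix] : 0.f;
            }
        __syncthreads();
        // Accumulate the fs x fs window in the chosen precision.
        for (int fy = 0; fy < fs; fy++)
            for (int fx = 0; fx < fs; fx++)
                acc += (ACCUM_DTYPE)tile[(threadIdx.y + fy) * tile_w + threadIdx.x + fx]
                     * (ACCUM_DTYPE)flt[((oc * ch_in + ic) * fs + fy) * fs + fx];
        __syncthreads();
    }
    if (oy < h && ox < w)
        out[(oc * h + oy) * w + ox] = (float)acc;   // stored back as float either way
}

Either way the output stays float32; the macro only changes the precision of the running sum. That’s why double tightens agreement with torch, but it comes at a cost: consumer GeForce parts execute FP64 at a small fraction (1/32 to 1/64) of FP32 rate.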
tile_size = 16
# test_cases = [(512, 128, 9, 64)]
# + ["-DDEBUG=1"],
mod = SourceModule(
    Path(cu_file).read_text(),
    options=warn_options.warn_options + ["-DACCUM_DTYPE=float", f"-DTILE_SIZE={tile_size}"],
    include_dirs=[str(Path("./kernels/conv2d/").absolute())]
)
mod_double = SourceModule(
    Path(cu_file).read_text(),
    options=warn_options.warn_options + ["-DACCUM_DTYPE=double", f"-DTILE_SIZE={tile_size}"],
    include_dirs=[str(Path("./kernels/conv2d/").absolute())]
)
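benchmark_conv2d_pad_z_out_shared_halo comes from the earlier days and isn’t shown here. Roughly, it has to allocate device buffers, launch the kernel with CUDA events around it, and return the output plus a timing in ms. A sketch matching the simplified kernel above rather than the real one; the signature is inferred from how it’s called below:

# Sketch of the benchmark helper from the earlier posts - the real
# benchmark_conv2d_pad_z_out_shared_halo may differ; launch geometry and
# kernel arguments here match the illustrative kernel sketch above.
def benchmark_conv2d_pad_z_out_shared_halo(kernel, input, filter, tile_size, repeat=5):
    ch_out, ch_in, fs, _ = filter.shape
    _, h, w = input.shape
    out = np.zeros((ch_out, h, w), dtype=np.float32)

    in_gpu = cuda.mem_alloc(input.nbytes)
    flt_gpu = cuda.mem_alloc(filter.nbytes)
    out_gpu = cuda.mem_alloc(out.nbytes)
    cuda.memcpy_htod(in_gpu, input)
    cuda.memcpy_htod(flt_gpu, filter)

    # One block per output tile, one grid layer per output channel.
    grid = ((w + tile_size - 1) // tile_size, (h + tile_size - 1) // tile_size, ch_out)
    shared = (tile_size + fs - 1) ** 2 * np.float32().nbytes  # tile + halo

    start, end = cuda.Event(), cuda.Event()
    times = []
    for _ in range(repeat):
        start.record()
        kernel(in_gpu, flt_gpu, out_gpu,
               np.int32(ch_in), np.int32(h), np.int32(w), np.int32(fs),
               block=(tile_size, tile_size, 1), grid=grid, shared=shared)
        end.record()
        end.synchronize()
        times.append(start.time_till(end))  # milliseconds

    cuda.memcpy_dtoh(out, out_gpu)
    return out, min(times)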
benchmarks = {
    "conv2d_pad_z_out_shared_halo_float": (
        benchmark_conv2d_pad_z_out_shared_halo,
        mod.get_function("conv2d_pad_z_out_shared_halo")
    ),
    "conv2d_pad_z_out_shared_halo_double": (
        benchmark_conv2d_pad_z_out_shared_halo,
        mod_double.get_function("conv2d_pad_z_out_shared_halo")
    )
}
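test_cases also carries over from the previous posts; each tuple is (ch_in, ch_out, filter_size, img_size), as the commented-out line above hints. A hypothetical grid for illustration, not the exact sizes behind the plots below:

from itertools import product
# Hypothetical sweep - tuples are (ch_in, ch_out, filter_size, img_size);
# the actual list from the earlier posts is not reproduced here.
test_cases = [
    (ch_in, ch_out, fs, pixels)
    for ch_in, ch_out in product([3, 64, 128], [16, 64, 128])
    for fs in [3, 5, 9]
    for pixels in [64, 128, 256]
]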
def run_benchmarks(benchmarks, test_cases):
    data = []
    for tc in tqdm(test_cases):
        ch_in, ch_out, fs, pixels = tc
        array_in = np.random.randn(ch_in, pixels, pixels).astype(np.float32)
        filter = np.random.randn(ch_out, ch_in, fs, fs).astype(np.float32)
        torch_out = conv2d(Tensor(array_in), Tensor(filter), padding="same")
        timings = {}
        for benchmark_name, (benchmark_func, kernel) in benchmarks.items():
            # tile_size comes from the cell above
            res, timing = benchmark_func(kernel, input=array_in, filter=filter, tile_size=tile_size, repeat=5)
            # Fraction of elements that match torch within tolerance
            similarity = float(np.isclose(res, torch_out, atol=1e-04, rtol=1e-4).mean())
            if similarity < 0.9:
                print(f"## Mismatch for '{benchmark_name}'")
                print(f"In: {array_in.shape}")
                print(f"Out: {(ch_out, pixels, pixels)}")
                print(f"Filter: {filter.shape}")
                print(f"Similarity: {similarity}")
                # display(Lo(np.isclose(res, torch_out, atol=1e-04, rtol=1e-4)).chans(cl=False, scale=2))
                # raise Exception
            timings[benchmark_name] = timing
            # time.sleep(10)
            cuda.Context.synchronize()
        data.append({
            'in_ch': ch_in,
            'out_ch': ch_out,
            'filter_size': fs,
            'img_size': pixels,
            # 'kernel': kernel_name,
        } | timings)
    return pd.DataFrame(data)
results = run_benchmarks(benchmarks, test_cases)
Results
def plot_results(results, sort_column: str, timing_columns: list[str]):
    # Sort by the chosen timing column
    results_sorted = results.sort_values(by=sort_column)

    # Create a plot comparing the two kernels
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Create labels for the x-axis that include the dimensions
    results_sorted['dimensions'] = results_sorted.apply(
        lambda row:
            f"{int(row['img_size'])}×{int(row['img_size'])}×{int(row['in_ch'])} -> " +
            f"{int(row['out_ch'])}, f:{int(row['filter_size'])}×{int(row['filter_size'])}",
        axis=1
    )

    # Melt the dataframe to get it in the right format for seaborn
    melted_results = pd.melt(
        results_sorted,
        id_vars=['in_ch', 'out_ch', 'filter_size', 'img_size', 'dimensions'],
        value_vars=timing_columns,
        var_name='kernel',
        value_name='time'
    )

    # Split the data into two halves based on timing
    midpoint = len(results_sorted) // 2
    faster_results = melted_results[melted_results['dimensions'].isin(results_sorted['dimensions'][:midpoint])]
    slower_results = melted_results[melted_results['dimensions'].isin(results_sorted['dimensions'][midpoint:])]

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 16))

    # Plot the faster results in the first subplot
    sns.barplot(x='dimensions', y='time', hue='kernel', data=faster_results, ax=ax1)
    ax1.set_xlabel('')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Performance Comparison - Faster Results')
    ax1.tick_params(axis='x', rotation=90)
    ax1.legend(title='Kernel')

    # Plot the slower results in the second subplot
    sns.barplot(x='dimensions', y='time', hue='kernel', data=slower_results, ax=ax2)
    ax2.set_xlabel('Input and Filter Dimensions')
    ax2.set_ylabel('Time (ms)')
    ax2.set_title('Performance Comparison - Slower Results')
    ax2.tick_params(axis='x', rotation=90)
    ax2.legend(title='Kernel')

    # Adjust layout
    plt.tight_layout()
    plt.show()
plot_results(
    results,
    "conv2d_pad_z_out_shared_halo_float",
    ["conv2d_pad_z_out_shared_halo_float", "conv2d_pad_z_out_shared_halo_double"]
)
# Also display the sorted results table
# results_sorted
Ok, double is slow AF. I guess I’ll stick to the float accumulator. We will compare all the kernels to date tomorrow.