import pandas as pd
import numpy as np
from math import prod
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
# Day 11 - conv2d with shared memory
from lovely_numpy import Lo
from lovely_tensors import monkey_patch; monkey_patch()
from torch import Tensor
from torch.nn.functional import conv2d
import warn_options
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
# Initialise the CUDA driver API and grab a handle to the first GPU (ordinal 0).
cuda.init()
device = cuda.Device(0)

# Report the CUDA version and the selected device. The inner join uses single
# quotes so the f-string also parses on Python versions older than 3.12.
print(f"Cuda version: {'.'.join(str(i) for i in cuda.get_version())}")
print(f"Device:\t{device.name()}")
Cuda version: 12.8.0
Device: NVIDIA GeForce RTX 3080 Laptop GPU
# Path to the CUDA source file for this notebook's kernel (judging by the file
# name, the z-out conv2d variant that uses shared memory).
cu_file = "kernels/conv2d/conv2d-z-out-shared.cu"
## Results
# Compare the timings of the three conv2d kernels across all benchmarked
# configurations. Expects `results` to be a DataFrame with the columns
# in_ch, out_ch, filter_size, img_size and one timing column per kernel
# (conv2d_pad, conv2d_pad_z_out, conv2d_pad_z_out_shared).

# Sort by conv2d_pad timing
results_sorted = results.sort_values(by='conv2d_pad')

# Create a plot comparing the kernels
import matplotlib.pyplot as plt
import seaborn as sns

# Create labels for x-axis that include dimensions
results_sorted['dimensions'] = results_sorted.apply(
    lambda row: f"{int(row['img_size'])}×{int(row['img_size'])}×{int(row['in_ch'])} -> {int(row['out_ch'])}, f:{int(row['filter_size'])}×{int(row['filter_size'])}",
    axis=1,
)

# Melt the dataframe to get it in the right format for seaborn:
# one row per (configuration, kernel) pair with the timing in 'time'.
melted_results = pd.melt(
    results_sorted,
    id_vars=['in_ch', 'out_ch', 'filter_size', 'img_size', 'dimensions'],
    value_vars=['conv2d_pad', 'conv2d_pad_z_out', 'conv2d_pad_z_out_shared'],
    var_name='kernel',
    value_name='time',
)

# Split the data into two halves based on timing. `results_sorted` keeps its
# original (shuffled) index after sorting, so slice by position with .iloc.
midpoint = len(results_sorted) // 2
faster_dims = results_sorted['dimensions'].iloc[:midpoint]
slower_dims = results_sorted['dimensions'].iloc[midpoint:]
faster_results = melted_results[melted_results['dimensions'].isin(faster_dims)]
slower_results = melted_results[melted_results['dimensions'].isin(slower_dims)]

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 16))

# Plot the faster half in the first subplot and the slower half in the second.
# Only the bottom subplot carries an x-axis label.
panels = (
    (ax1, faster_results, '', 'Performance Comparison - Faster Results'),
    (ax2, slower_results, 'Input and Filter Dimensions', 'Performance Comparison - Slower Results'),
)
for ax, panel_data, xlabel, title in panels:
    sns.barplot(x='dimensions', y='time', hue='kernel', data=panel_data, ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Time (ms)')
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=90)  # labels are long; rotate to avoid overlap
    ax.legend(title='Kernel')

# Adjust layout
plt.tight_layout()
plt.show()

# Also display the sorted results table
results_sorted
| | in_ch | out_ch | filter_size | img_size | conv2d_pad_z_out_shared | conv2d_pad | conv2d_pad_z_out | dimensions |
---|---|---|---|---|---|---|---|---|
18 | 1 | 4 | 1 | 128 | 0.015974 | 0.014950 | 0.014336 | 128×128×1 -> 4, f:1×1 |
47 | 1 | 1 | 3 | 64 | 0.016794 | 0.018637 | 0.014541 | 64×64×1 -> 1, f:3×3 |
21 | 3 | 1 | 3 | 256 | 0.037274 | 0.027034 | 0.025805 | 256×256×3 -> 1, f:3×3 |
23 | 1 | 8 | 3 | 64 | 0.022733 | 0.033587 | 0.020480 | 64×64×1 -> 8, f:3×3 |
43 | 3 | 8 | 3 | 64 | 0.025395 | 0.058573 | 0.019866 | 64×64×3 -> 8, f:3×3 |
13 | 3 | 1 | 3 | 512 | 0.092570 | 0.058778 | 0.058163 | 512×512×3 -> 1, f:3×3 |
12 | 32 | 1 | 1 | 256 | 0.106086 | 0.062259 | 0.064922 | 256×256×32 -> 1, f:1×1 |
24 | 1 | 4 | 3 | 512 | 0.120627 | 0.073114 | 0.084378 | 512×512×1 -> 4, f:3×3 |
30 | 1 | 32 | 3 | 64 | 0.030106 | 0.080077 | 0.109363 | 64×64×1 -> 32, f:3×3 |
33 | 32 | 4 | 1 | 128 | 0.085811 | 0.086221 | 0.056525 | 128×128×32 -> 4, f:1×1 |
48 | 3 | 32 | 1 | 256 | 0.186368 | 0.126362 | 0.129638 | 256×256×3 -> 32, f:1×1 |
20 | 1 | 32 | 5 | 128 | 0.128205 | 0.169984 | 0.084173 | 128×128×1 -> 32, f:5×5 |
0 | 1 | 128 | 1 | 128 | 0.093184 | 0.192512 | 0.073523 | 128×128×1 -> 128, f:1×1 |
26 | 3 | 128 | 1 | 128 | 0.180838 | 0.233882 | 0.126566 | 128×128×3 -> 128, f:1×1 |
16 | 1 | 512 | 1 | 128 | 0.319693 | 0.369664 | 0.244326 | 128×128×1 -> 512, f:1×1 |
14 | 8 | 4 | 3 | 512 | 0.820224 | 0.403046 | 0.499302 | 512×512×8 -> 4, f:3×3 |
34 | 32 | 32 | 1 | 128 | 0.395059 | 0.556237 | 0.237363 | 128×128×32 -> 32, f:1×1 |
32 | 3 | 128 | 3 | 64 | 0.141722 | 0.725811 | 0.089702 | 64×64×3 -> 128, f:3×3 |
44 | 512 | 1 | 1 | 256 | 1.329971 | 0.743629 | 0.754483 | 256×256×512 -> 1, f:1×1 |
37 | 8 | 8 | 1 | 1024 | 1.926963 | 0.826368 | 1.283072 | 1024×1024×8 -> 8, f:1×1 |
25 | 8 | 4 | 5 | 512 | 1.817190 | 0.915046 | 0.969114 | 512×512×8 -> 4, f:5×5 |
38 | 32 | 32 | 1 | 256 | 1.848525 | 1.133158 | 1.107354 | 256×256×32 -> 32, f:1×1 |
1 | 3 | 32 | 1 | 1024 | 3.284378 | 1.242317 | 2.644992 | 1024×1024×3 -> 32, f:1×1 |
22 | 1 | 128 | 5 | 256 | 1.764147 | 1.248870 | 1.013350 | 256×256×1 -> 128, f:5×5 |
3 | 512 | 1 | 3 | 64 | 2.400461 | 1.253990 | 1.257882 | 64×64×512 -> 1, f:3×3 |
28 | 32 | 8 | 5 | 128 | 0.943514 | 1.274675 | 0.467763 | 128×128×32 -> 8, f:5×5 |
45 | 512 | 1 | 3 | 128 | 2.519245 | 1.280614 | 1.275904 | 128×128×512 -> 1, f:3×3 |
7 | 3 | 128 | 1 | 512 | 3.371622 | 1.326080 | 2.317722 | 512×512×3 -> 128, f:1×1 |
17 | 3 | 512 | 1 | 256 | 2.973082 | 1.753088 | 1.833779 | 256×256×3 -> 512, f:1×1 |
41 | 3 | 128 | 5 | 64 | 0.328909 | 1.810022 | 0.183296 | 64×64×3 -> 128, f:5×5 |
9 | 3 | 128 | 5 | 128 | 1.266893 | 1.844634 | 0.679117 | 128×128×3 -> 128, f:5×5 |
15 | 32 | 4 | 3 | 512 | 3.226624 | 1.954816 | 1.873306 | 512×512×32 -> 4, f:3×3 |
40 | 8 | 512 | 1 | 64 | 0.394240 | 2.289254 | 0.242483 | 64×64×8 -> 512, f:1×1 |
49 | 32 | 8 | 5 | 256 | 3.610214 | 2.547712 | 1.794867 | 256×256×32 -> 8, f:5×5 |
8 | 3 | 8 | 5 | 1024 | 5.853798 | 2.647040 | 3.288064 | 1024×1024×3 -> 8, f:5×5 |
35 | 512 | 8 | 1 | 64 | 0.688538 | 2.897920 | 0.348160 | 64×64×512 -> 8, f:1×1 |
46 | 128 | 32 | 1 | 128 | 1.876787 | 3.093299 | 0.982426 | 128×128×128 -> 32, f:1×1 |
10 | 1 | 32 | 5 | 1024 | 9.511936 | 3.605504 | 4.901683 | 1024×1024×1 -> 32, f:5×5 |
29 | 3 | 32 | 3 | 1024 | 11.349197 | 4.278477 | 6.614630 | 1024×1024×3 -> 32, f:3×3 |
19 | 8 | 512 | 1 | 256 | 6.478848 | 4.831232 | 4.056064 | 256×256×8 -> 512, f:1×1 |
27 | 32 | 32 | 5 | 64 | 0.920986 | 5.430477 | 0.462234 | 64×64×32 -> 32, f:5×5 |
5 | 128 | 8 | 5 | 64 | 1.239859 | 5.528781 | 0.618906 | 64×64×128 -> 8, f:5×5 |
42 | 1 | 512 | 3 | 512 | 14.623539 | 6.328320 | 9.598566 | 512×512×1 -> 512, f:3×3 |
6 | 8 | 8 | 5 | 1024 | 16.636518 | 7.377920 | 8.506163 | 1024×1024×8 -> 8, f:5×5 |
39 | 32 | 512 | 1 | 128 | 6.386483 | 9.455616 | 3.391898 | 128×128×32 -> 512, f:1×1 |
36 | 32 | 128 | 3 | 128 | 5.712077 | 9.837773 | 3.195904 | 128×128×32 -> 128, f:3×3 |
4 | 32 | 128 | 3 | 256 | 27.863654 | 22.527795 | 15.678259 | 256×256×32 -> 128, f:3×3 |
31 | 512 | 8 | 5 | 128 | 17.193369 | 26.259456 | 7.303168 | 128×128×512 -> 8, f:5×5 |
2 | 32 | 512 | 3 | 64 | 5.996749 | 38.739558 | 3.200000 | 64×64×32 -> 512, f:3×3 |
11 | 512 | 4 | 5 | 512 | 134.970367 | 81.715004 | 68.868095 | 512×512×512 -> 4, f:5×5 |
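For some reason, the shared-memory version (`conv2d_pad_z_out_shared`) is actually slower than the version without shared memory (`conv2d_pad_z_out`) in most of the configurations above. I'm not entirely sure why, because the kernel looks correct.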