import numpy as np

def np_pad2d(a: np.ndarray, pad: tuple[int, int]) -> np.ndarray:
    h, w = pad
    pad_widths = [(0, 0)] * (a.ndim - 2) + [(h, h), (w, w)]
    return np.pad(a, pad_widths, mode="constant")

def np_unpad2d(a: np.ndarray, pad: tuple[int, int]) -> np.ndarray:
    h, w = pad
    return a[..., h:-h, w:-w]
Functional operations: Convolution and Pooling
# Updated test for np_pad
A = np.array([[1, 2], [3, 4]])
pad_A = np_pad2d(A, (1, 1))
assert np.array_equal(pad_A, [[0, 0, 0, 0], [0, 1, 2, 0], [0, 3, 4, 0], [0, 0, 0, 0]])

A_recovered = np_unpad2d(pad_A, (1, 1))
assert np.array_equal(A, A_recovered)
Pad
Pad (a, padding:Union[int,Tuple[int,int]], name=None)
Pad a tensor
from matplotlib import pyplot as plt
import tidygrad as tg

x = tg.Tensor(np.random.randn(1, 1, 4, 5), name="x")
bias = Pad(x, 1).out
plt.imshow(bias.data[0, 0, :, :]);
def np_conv2d(input: np.ndarray, kernel: np.ndarray, bias: np.ndarray):
    batch, input_channels, input_height, input_width = input.shape
    output_channels, _input_channels, kernel_height, kernel_width = kernel.shape

    assert kernel_height % 2 == 1, "Only odd kernel sizes are supported for now"
    assert kernel_width % 2 == 1, "Only odd kernel sizes are supported for now"
    assert (input_channels == _input_channels), f"Input channels mismatch: {input_channels} != {_input_channels}"
    assert bias.shape == (output_channels, ), f"Invalid bias shape: {bias.shape}"

    # bias = bias / (kernel_height * kernel_width)

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1

    y = np.zeros((batch, output_channels, output_height, output_width), dtype=input.dtype)
    for r in range(output_height):
        for c in range(output_width):
            region = input[..., r:r + kernel_height, c:c + kernel_width]
            # Insert an output-channel axis so the (batch, in_ch, kh, kw) region
            # broadcasts against the (out_ch, in_ch, kh, kw) kernel for any batch size.
            y[..., r, c] = np.sum(region[:, None] * kernel, axis=(-1, -2, -3)) + bias
    return y
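As a quick sanity check (a sketch only, assuming scipy is available; it is not used anywhere else here), a single-channel np_conv2d result with the bias set to zero should match scipy.signal.correlate2d with mode="valid", since the kernel is not flipped:

from scipy import signal

a = np.random.randn(1, 1, 6, 6)
k = np.random.randn(1, 1, 3, 3)
b = np.zeros(1)

ours = np_conv2d(a, k, b)[0, 0]
ref = signal.correlate2d(a[0, 0], k[0, 0], mode="valid")  # cross-correlation, like np_conv2d
assert np.allclose(ours, ref)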
def np_kernel_grad(input: np.ndarray, kernel: np.ndarray, bias: np.ndarray, grad: np.ndarray):
    batch, input_channels, input_height, input_width = input.shape
    output_channels, _input_channels, kernel_height, kernel_width = kernel.shape

    assert kernel_height % 2 == 1, "Only odd kernel sizes are supported for now"
    assert kernel_width % 2 == 1, "Only odd kernel sizes are supported for now"
    assert (input_channels == _input_channels), f"Input channels mismatch: {input_channels} != {_input_channels}"
    assert bias.shape == (output_channels, ), f"Invalid bias shape: {bias.shape}"

    output_height = input_height - kernel_height + 1
    output_width = input_width - kernel_width + 1

    # padded_input # [ batch, input_channels, input_height, input_width ]
    # kernel       # [ output_channels, input_channels, kernel_height, kernel_width ]
    # grad         # [ batch, output_channels, output_height, output_width ]

    # assert kernel_width == kernel_height, "Only square kernels are supported for now"
    assert grad.shape[-2:] == (output_height, output_width), f"Invalid grad shape: {grad.shape}"

    grad_w = np.zeros_like(kernel)
    for r in range(output_height):
        for c in range(output_width):
            p = input[:, :, r:r + kernel_height, c:c + kernel_width]
            for ch in range(0, output_channels):
                # Per-sample upstream gradient for this output channel, reshaped so it
                # broadcasts over (input_channels, kernel_height, kernel_width).
                q = grad[:, ch, r, c].reshape(-1, 1, 1, 1)
                grad_w[ch, :, :, :] += (p * q).sum(axis=0)
    return grad_w
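Before wrapping these into an op, a quick finite-difference spot check of np_kernel_grad against np_conv2d (a sketch; the loss is taken as the plain sum of the output, and eps and the tolerance are arbitrary choices):

a = np.random.randn(1, 2, 5, 5)
k = np.random.randn(3, 2, 3, 3)
b = np.zeros(3)

g = np.ones_like(np_conv2d(a, k, b))  # upstream gradient of loss = out.sum()
gk = np_kernel_grad(a, k, b, g)

eps = 1e-5
idx = (0, 0, 0, 0)  # check a single kernel element
k_plus, k_minus = k.copy(), k.copy()
k_plus[idx] += eps
k_minus[idx] -= eps
numeric = (np_conv2d(a, k_plus, b).sum() - np_conv2d(a, k_minus, b).sum()) / (2 * eps)
assert np.allclose(gk[idx], numeric, rtol=1e-4, atol=1e-6)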
Relationship Between Padding, Kernel Size, and Deconvolution
For a convolution operation, given:
- Input size \(N\)
- Kernel size \(k\) (where \(k \leq N\))
- Padding \(p\) (applied symmetrically on all sides)
The padded input will have a size of \(N + 2p\).
After applying the convolution with the kernel, the output \(y\) will have a size:
\[ N_y = N + 2p - k + 1 \]
Back to Input Domain (Deconvolution)
Before convolving back to the input domain, we apply a new padding \(p'\) (unknown at this point) to \(N_y\), resulting in a length of \(N_y + 2p'\).
After the deconvolution, we get:
\[ (N_y + 2p') - k + 1 = N + 2p - k + 1 + 2p' - k + 1 \]
Simplifying, we find:
\[
\begin{aligned}
N + 2(p + p') - 2k + 2 &= N \\
2(p + p') - 2k + 2 &= 0 \\
p + p' &= k - 1 \\
p' &= k - p - 1
\end{aligned}
\]
This equation gives us the value of \(p'\) required to get back to the input domain size \(N\) after deconvolution.
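A quick numeric check of the last identity, using an input of size 7, kernel 3, and padding 1 (plain arithmetic, nothing library-specific):

N, k, p = 7, 3, 1
N_y = N + 2 * p - k + 1    # forward output size: 7
p_prime = k - p - 1        # padding needed to map back: 1
assert (N_y + 2 * p_prime) - k + 1 == N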
Back-propagation through a convolutional layer
For a convolution operation, given:
(odd kernel sizes only, stride one)
- Input size \(N\)
- Kernel size \(k\) (where \(k \leq N\))
Forward pass:
- Output size will be \(N - (k - 1)\)
For example: input 7x7, kernel 3 -> output 5x5; input 11x11, kernel 5 -> output 7x7.
Backward pass: calculating the gradient for the input.
The shape of the chain (upstream) gradient will be the same as the output's.
We are going to apply a convolution to it with the rotated and transposed kernel, so we need to pad the gradient for that convolution to produce an output of the input's size.
For example: (forward) input 7x7, kernel 3 -> output 5x5; (backward) gradient 5x5, padded by P, kernel 3 -> output should be 7x7.
Thus 2P (padding on both sides) should be input_size - output_size + (kernel_size - 1)
And P should be (input_size - output_size + (kernel_size - 1)) / 2
Note that (kernel_size - 1) is always even, and input_size and output_size are either both even or both odd, so this divides evenly.
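The same arithmetic for the 7x7, kernel-3 example above (a worked check, nothing library-specific):

input_size, kernel_size = 7, 3
output_size = input_size - (kernel_size - 1)              # forward: 7 -> 5
P = (input_size - output_size + (kernel_size - 1)) // 2   # P = 2
assert output_size + 2 * P - kernel_size + 1 == input_size  # padded 5x5 gradient convolves back to 7x7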
class Conv2D(BaseOp):
    def __init__(self, input, kernel, bias, stride=1, padding=0, name=None):
        super().__init__(input, kernel, bias, name=name)
        assert stride == 1, "Only stride=1 is supported for now"
        if 1 in kernel.data.shape[-2:]:
            assert padding == 0, "Padding is not supported for 1x1 kernels"
        self.stride = stride
        self.padding = (padding, padding) if isinstance(padding, int) else padding
        self.parents = [*self.args] if self.requires_grad else []

        self.input_data_padded = np_pad2d(self.args[0].data, self.padding)
        data = np_conv2d(self.input_data_padded, kernel.data, bias.data)

        self.set_out(data)
        # self.out = Tensor(data, name=self.name, op=self)

    def backward(self):
        rot180 = lambda x: np.rot90(x, 2, axes=(-1, -2)).transpose(1, 0, 2, 3)
        input, kernel, bias = self.args
        grad = self.out.grad

        pad_h = (input.data.shape[-2] - self.out.data.shape[-2] + (kernel.data.shape[-2] - 1))
        pad_w = (input.data.shape[-1] - self.out.data.shape[-1] + (kernel.data.shape[-1] - 1))

        assert pad_h % 2 == 0, "Invalid padding"
        assert pad_w % 2 == 0, "Invalid padding"
        pad_h //= 2
        pad_w //= 2

        padded_grad = np_pad2d(grad, (pad_h, pad_w))
        input.accum_grad(np_conv2d(padded_grad, rot180(kernel.data), np.zeros(input.shape[1])))
        kernel.accum_grad(np_kernel_grad(self.input_data_padded, kernel.data, bias.data, grad))
        bias.accum_grad(grad.sum(axis=(0, -1, -2)))
from tidygrad.utils.grad_check import grad_check

h = 12
w = 12
kernel_h = 3
kernel_w = 3

in_ch = 2
out_ch = 5
pad = 1

x = tg.Tensor(np.random.randn(in_ch, h, w)[None], "x", requires_grad=True)

filter = np.random.randn(out_ch, in_ch, kernel_h, kernel_w)
kernel = tg.Tensor(filter, "w", requires_grad=True)
bias = tg.Tensor(np.random.randn(out_ch), "b", requires_grad=True)

conv = Conv2D(x, kernel, bias, padding=pad).out
loss = conv.mean()
loss.backward()

def func(inputs, params):
    x, kernel, bias = params
    return Conv2D(x, kernel, bias, padding=pad).out.mean()

grad_check(func, None, (x, kernel, bias))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for w: 0.0000%
Max fractional gradient difference for x: 0.0000%
from tidygrad.utils.datasets import MNIST

mnist = MNIST()

x = tg.Tensor(mnist[1][0][None][None], "x")
plt.imshow(x.data[0, 0])
plt.show()

filter = np.array([[1.0, 0, -1], [1, 0, -1], [1, 0, -1]])[None, None]  # vertical-edge detector
kernel = tg.Tensor(filter.reshape(1, 1, 3, 3), "w")
bias = tg.Tensor(np.zeros((1, )), "b")

conv = Conv2D(x, kernel, bias, padding=10).out
# conv = Conv2D(conv, kernel, bias, padding=1).out
# conv = Conv2D(conv, kernel, bias, padding=1).out
plt.imshow(conv.data[0, 0, :, :])
class MaxPool2D(UnaryElementwiseOp):
    def __init__(self, a, kernel_size, name=None):
        super().__init__(a, name=name)
        self.kernel_size = kernel_size
A = np.arange(0, 36).reshape(1, 1, 6, 6)
A.reshape(1, 1, 3, 3, 2, 2)
print(A)

block_size = 2

shape = A.shape
new_shape = shape[:-2] + (
    shape[-2] // block_size,
    block_size,
    shape[-1] // block_size,
    block_size,
)
A.reshape(new_shape).swapaxes(-2, -3)
def factorial(a):
    return np.prod(np.arange(1, a + 1))
[[[[ 0 1 2 3 4 5]
[ 6 7 8 9 10 11]
[12 13 14 15 16 17]
[18 19 20 21 22 23]
[24 25 26 27 28 29]
[30 31 32 33 34 35]]]]
A = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])

A = A.reshape(2, 2, 2, 2)
print(A)
A.swapaxes(-2, -3)
[[[[ 1 2]
[ 3 4]]
[[ 5 6]
[ 7 8]]]
[[[ 9 10]
[11 12]]
[[13 14]
[15 16]]]]
array([[[[ 1, 2],
[ 5, 6]],
[[ 3, 4],
[ 7, 8]]],
[[[ 9, 10],
[13, 14]],
[[11, 12],
[15, 16]]]])
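Putting the reshape/swapaxes trick together, the forward pass of MaxPool2D could be backed by a helper like the one below. This is only a sketch: np_maxpool2d is a name introduced here for illustration, and it assumes the spatial dimensions divide evenly by the kernel size.

def np_maxpool2d(a: np.ndarray, kernel_size: int) -> np.ndarray:
    # Reduce each non-overlapping kernel_size x kernel_size block to its maximum.
    h, w = a.shape[-2:]
    assert h % kernel_size == 0 and w % kernel_size == 0, "Spatial dims must divide by kernel_size"
    new_shape = a.shape[:-2] + (h // kernel_size, kernel_size, w // kernel_size, kernel_size)
    blocks = a.reshape(new_shape).swapaxes(-2, -3)  # (..., h // k, w // k, k, k)
    return blocks.max(axis=(-1, -2))

A = np.arange(0, 36).reshape(1, 1, 6, 6)
np_maxpool2d(A, 2)  # each 2x2 block collapses to its max, giving shape (1, 1, 3, 3)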