import numpy as np
from tidygrad.tensor import Tensor
from tidygrad.utils.grad_check import grad_check
Binary elementwise ops
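Every test below relies on grad_check, which compares the gradients filled in by backward() against numerically estimated ones and reports the maximum fractional difference. A minimal sketch of the idea (the usual central-difference recipe, not tidygrad's actual implementation):
def numeric_grad_sketch(f, x, eps=1e-6):
    # f maps an ndarray to a scalar; estimate df/dx by central differences
    g = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    for _ in it:
        i = it.multi_index
        x_hi, x_lo = x.copy(), x.copy()
        x_hi[i] += eps
        x_lo[i] -= eps
        g[i] = (f(x_hi) - f(x_lo)) / (2 * eps)
    return g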
def run_test_binary_elementwise(func, shape1, shape2=None, pos_only=False):
"""Test a binary elementwise function, like add, mul, etc"""
shape2 = shape1 if shape2 is None else shape2
if pos_only:
a = Tensor(np.abs(np.random.randn(*shape1)) + 1e-8, name="a", requires_grad=True)
b = Tensor(np.abs(np.random.randn(*shape2)) + 1e-8, name="b", requires_grad=True)
else:
a = Tensor(np.random.randn(*shape1), name="a", requires_grad=True)
b = Tensor(np.random.randn(*shape2), name="b", requires_grad=True)
t = func(inputs=None, params=(a, b))
t.backward()
grad_check(func=func, inputs=None, params=(a, b), verbose=False)def run_test_unary_elementwise(func, shape, pos_only=False, offset=1e-3):
"""Test a unary elementwise function, like exp, log, etc"""
if pos_only:
# Mostly for log(a) - it's positive only and is instable too close to zero.
a = Tensor(
np.abs(np.random.randn(*shape)) + offset, name="a", requires_grad=True
)
else:
a = Tensor(np.random.randn(*shape), name="a", requires_grad=True)
t = func(inputs=None, params=(a,))
t.backward()
grad_check(func=func, inputs=None, params=(a,))a = Tensor(np.random.randn(2, 3), name="a", requires_grad=True)
b = Tensor(np.random.randn(2, 3), name="b", requires_grad=True)
c = a + b
loss = c.sum()
loss.backward()
def add_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.add(b, "t").sum()
    return loss
run_test_binary_elementwise(add_func, (1, 1))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def sub_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.sub(b, "t").sum("loss")
    return loss
run_test_binary_elementwise(sub_func, (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def mul_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.mul(b, "t").sum("loss")
    return loss
run_test_binary_elementwise(mul_func, (100, 100))
Max fractional gradient difference for b: 0.0001%
Max fractional gradient difference for a: 0.0002%
def pow_func(inputs, params: tuple = ()):
    a = params[0]
    loss = a.pow(2, "t").sum("loss")
    return loss
def run_test_pow(shape):
    a = Tensor(np.random.randn(*shape), name="a", requires_grad=True)
    a.data = np.where(np.abs(a.data) < 1e-5, 1e-5, a.data)
    t = pow_func(inputs=None, params=(a,))
    t.backward()
    grad_check(func=pow_func, inputs=None, params=(a,))
# XXX pow is unstable for values close to zero
# run_test_pow((100, 100))
Unary elementwise functions
def log_func(inputs, params: tuple = ()):
    (a,) = params
    loss = a.log("t").sum("loss")
    return loss
run_test_unary_elementwise(log_func, (100, 100), pos_only=True)
Max fractional gradient difference for a: 0.1248%
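pos_only=True (with the 1e-3 offset) matters here: d log(a)/da = 1/a blows up near zero and a finite-difference step can even cross into negative territory, so the numerical estimate degrades for small a. That is why this check shows the largest fractional difference in the notebook. A quick illustration of the effect:
eps = 1e-6
for a0 in (1.0, 1e-3, 1e-5):
    numeric = (np.log(a0 + eps) - np.log(a0 - eps)) / (2 * eps)
    analytic = 1.0 / a0
    print(a0, abs(numeric - analytic) / analytic)  # relative error grows as a0 -> 0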
def exp_func(inputs, params: tuple = ()):
    (a,) = params
    loss = a.exp("t").sum("loss")
    return loss
run_test_unary_elementwise(exp_func, (100, 100))
Max fractional gradient difference for a: 0.0028%
import tidygrad.func as F
# from tidygrad.func import relu, sigmoid, tanh, softmax, gelu, new_gelu
def sigmoid_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.sigmoid(a)
    return t.sum("loss")
run_test_unary_elementwise(sigmoid_func, (100, 100))
Max fractional gradient difference for a: 0.0005%
def tanh_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.tanh(a)
    return t.sum("loss")
run_test_unary_elementwise(tanh_func, (100, 100))
Max fractional gradient difference for a: 0.0010%
def relu_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.relu(a, "t")
    return t.sum("loss")
run_test_unary_elementwise(relu_func, (100, 100))
Max fractional gradient difference for a: 0.0000%
def gelu_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.gelu(a)
    return t.sum("loss")
# XXX Stability issues
# run_test_unary_elementwise(gelu_func, (100, 100))
def softmax_func(inputs, params: tuple = ()):
    (a,) = params
    n_batch, n_classes = a.shape
    y = np.zeros(a.shape)
    np.random.seed(42)
    y[np.arange(n_batch), np.random.randint(0, n_classes, n_batch)] = 1
    y = Tensor(y, name="y")
    sm = F.softmax(a, "t")
    cross_entropy = y * sm.log() + (1 - y) * (1 - sm).log()
    return cross_entropy.sum("loss")
run_test_unary_elementwise(softmax_func, (1, 5))
Max fractional gradient difference for a: 0.0007%
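For reference, the quantity being checked above can be restated directly in numpy; a sketch of the same elementwise binary log-likelihood over the softmax outputs (plain numpy, not tidygrad code):
def softmax_np(z):
    z = z - z.max(axis=-1, keepdims=True)  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

z = np.random.randn(1, 5)
y_np = np.eye(5)[np.random.randint(0, 5, 1)]                 # one-hot targets
sm_np = softmax_np(z)
ll = y_np * np.log(sm_np) + (1 - y_np) * np.log(1 - sm_np)   # same form as cross_entropy above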
def matmul_func(inputs, params: tuple[Tensor] = ()):
    a, b = params
    t = a.mmul(b, "t")
    return t.sum("loss")
def run_test_matmul(shape1, shape2):
    a = Tensor(np.random.randn(*shape1), name="a", requires_grad=True)
    b = Tensor(np.random.randn(*shape2), name="b", requires_grad=True)
    t = matmul_func(inputs=None, params=(a, b))
    t.backward()
    grad_check(func=matmul_func, inputs=None, params=(a, b))
run_test_matmul((10, 100), (100, 50))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
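For t = a.mmul(b) followed by sum(), the standard matmul gradient identities give da = g @ b.T and db = a.T @ g, where g is the upstream gradient (all ones here). A numpy sanity check of that identity (not tidygrad code):
a_np = np.random.randn(10, 100)
b_np = np.random.randn(100, 50)
g = np.ones((10, 50))        # gradient of sum() with respect to t
da = g @ b_np.T              # d loss / d a
db = a_np.T @ g              # d loss / d b
# spot-check one entry of da against a finite difference
eps = 1e-6
i, j = 3, 7
a_pert = a_np.copy()
a_pert[i, j] += eps
num = ((a_pert @ b_np).sum() - (a_np @ b_np).sum()) / eps
assert np.allclose(num, da[i, j], rtol=1e-4, atol=1e-4)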
Broadcasting
run_test_binary_elementwise(add_func, (2, 10, 1), (10, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
run_test_matmul((2, 10, 100), (100, 10))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
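These pass because (assuming tidygrad follows the usual broadcasting rule) the gradient flowing into a broadcast input is summed over the broadcast dimensions until it matches the input's original shape. A numpy sketch for the (2, 10, 1) + (10, 100) case above:
a_np = np.random.randn(2, 10, 1)
b_np = np.random.randn(10, 100)
g = np.ones(np.broadcast(a_np, b_np).shape)  # upstream grad of sum(), shape (2, 10, 100)
da = g.sum(axis=2, keepdims=True)            # collapse the axis where a had size 1 -> (2, 10, 1)
db = g.sum(axis=0)                           # collapse the leading axis b never had -> (10, 100)
assert da.shape == a_np.shape and db.shape == b_np.shape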
Test loss functions
# def lt_func(inputs, params: tuple = ()):
#     a, b = params
#     loss = (a < b).sum("loss")
#     return loss
# run_test_binary_elementwise(lt_func, (100, 100), (100, 100))
# a = Tensor(np.random.randn(100, 100), name="a")
# b = Tensor(np.random.randn(100, 100), name="b")
# t = lt_func(inputs=None, params=(a, b))
# t.backward()
# from tidygrad.functional import BCE_loss
def bceloss_func(inputs, params: tuple = ()):
    y = inputs[0]
    x = params[0]
    loss = F.BCE_loss(x, y).sum("loss")
    return loss
x = Tensor(np.random.randn(100), name="x", requires_grad=True)
y = Tensor(np.random.randn(100), name="y", requires_grad=True)
t = bceloss_func(inputs=(y, ), params=(x, ))
t.backward()
grad_check(func=bceloss_func, inputs=(y, ), params=(x, ))
Max fractional gradient difference for x: 0.0029%
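For reference only, a plain numpy version of binary cross-entropy from logits. Whether F.BCE_loss takes raw logits and applies the sigmoid internally (and its sign convention) is an assumption here; check the tidygrad implementation for the exact form:
def bce_np(logits, targets):
    # assumed reference, not tidygrad's actual code
    p = 1.0 / (1.0 + np.exp(-logits))
    return -(targets * np.log(p) + (1 - targets) * np.log(1 - p))

ref = bce_np(x.data, y.data).sum()  # numpy reference value for comparison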
Test Dropout
# from tidygrad.functional import dropout
def dropout_func(inputs, params: tuple = ()):
    p = params[0]
    np.random.seed(1337)
    t = F.dropout(p, 0.3, training=True)
    return t.sum("loss")
p = Tensor(np.random.randn(100), name="p", requires_grad=True)
t = dropout_func(inputs=None, params=(p, ))
t.backward()
grad_check(func=dropout_func, inputs=None, params=(p, ))
Max fractional gradient difference for p: 0.0000%
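A sketch of the usual inverted-dropout formulation; the fixed seed above matters because grad_check must draw the same mask in the analytic and numerical passes. Whether tidygrad rescales by 1/(1 - p) is an assumption here:
def dropout_np(x, p, seed=1337):
    np.random.seed(seed)
    mask = (np.random.rand(*x.shape) >= p).astype(x.dtype)
    return x * mask / (1.0 - p)  # inverted dropout: rescale the kept units at train time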
Test Embedding
# from tidygrad.functional import embedding
def embedding_func(inputs, params: tuple = ()):
    idxs = inputs[0]
    w = params[0]
    t = F.embedding(w, idxs, "t")
    return t.sum("loss")
idxs = [1, 2, 3, 4, 5, 6, 7, 8, 9]
w = Tensor(np.random.randn(10, 100), name="w", requires_grad=True)
t = embedding_func(inputs=(idxs, ), params=(w, ))
t.backward()
grad_check(func=embedding_func, inputs=(idxs, ), params=(w, ))
Max fractional gradient difference for w: 0.0000%
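The embedding check is really a gather/scatter pair: the forward pass picks rows of w, and the backward pass scatter-adds the upstream gradient into those rows. A numpy sketch (not tidygrad code):
w_np = np.random.randn(10, 100)
idx = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
out = w_np[idx]                # forward: gather the selected rows
g = np.ones_like(out)          # grad of sum() with respect to the gathered rows
dw = np.zeros_like(w_np)
np.add.at(dw, idx, g)          # backward: scatter-add into the selected rows; row 0 stays zero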
Test sum, mean, and std
def sum_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.sum("t")
    return t.sum("loss")
run_test_unary_elementwise(sum_test, (100, 100))
Max fractional gradient difference for a: 0.0000%
def mean_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.mean("t")
    return t.sum("loss")
run_test_unary_elementwise(mean_test, (100, 100))
Max fractional gradient difference for a: 0.0000%
def std_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.std("t")
    return t.sum("loss")
run_test_unary_elementwise(std_test, (100, 100))
Max fractional gradient difference for a: 0.0049%
a = Tensor(np.random.randn(100, 100), name="a", requires_grad=True)
a**3
Tensor[100, 100](name="" op=Pow parents=[a]):
v=array[100, 100] n=10000 (78Kb) x∈[-41.412, 47.474] μ=0.066 σ=3.739
∇=array[100, 100] n=10000 (78Kb) all_zeros
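The ∇ buffer above is all zeros simply because nothing downstream of a**3 has had backward() called on it yet; reducing to a scalar and backpropagating fills it with the analytic gradient 3·a²:
t = a ** 3
loss = t.sum("loss")
loss.backward()  # after this, a's gradient buffer should hold 3 * a.data**2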
# from tidygrad.functional import stack, concat
def stack_test(inputs, params: tuple = ()):
    t = F.stack(params, name="t")
    return t.sum("loss")
run_test_binary_elementwise(stack_test, (100, 100), (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def concat_test(inputs, params: tuple = ()):
    t = F.concat(params, name="t")
    return t.sum("loss")
run_test_binary_elementwise(concat_test, (100, 100), (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
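Both checks come out exact because stack and concat only route values: the upstream gradient is split back along the stacked/concatenated axis with no arithmetic involved. A numpy sketch of the concat case (axis 0 assumed purely for illustration):
a_np = np.random.randn(100, 100)
b_np = np.random.randn(100, 100)
t_np = np.concatenate([a_np, b_np], axis=0)  # forward
g = np.ones_like(t_np)                       # grad of sum() with respect to t
da, db = g[:100], g[100:]                    # backward: slice the gradient back apart
assert da.shape == a_np.shape and db.shape == b_np.shape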
from tidygrad.func import layer_norm
def layer_norm_test(inputs, params):
    a, w, b = params
    t = layer_norm(a, w, b)
    return t.sum("loss")
a = Tensor(np.random.randn(2, 100, 100), name="a", requires_grad=True)
w = Tensor(np.random.randn(100), name="w", requires_grad=True)
b = Tensor(np.random.randn(100), name="b", requires_grad=True)
t = layer_norm_test(inputs=None, params=(a, w, b))
t.backward()
grad_check(func=layer_norm_test, inputs=None, params=(a, w, b))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for w: 0.0000%
Max fractional gradient difference for a: 0.0074%
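For reference, a common layer-norm formulation consistent with the check above; the exact eps and the normalization axis used by tidygrad are assumptions here:
def layer_norm_np(x, w, b, eps=1e-5):
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b

ref = layer_norm_np(a.data, w.data, b.data)  # shape (2, 100, 100)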