import numpy as np
from tidygrad.tensor import Tensor
from tidygrad.utils.grad_check import grad_check
Binary elementwise ops
def run_test_binary_elementwise(func, shape1, shape2=None, pos_only=False):
    """Test a binary elementwise function, like add, mul, etc"""
    shape2 = shape1 if shape2 is None else shape2
    if pos_only:
        a = Tensor(np.abs(np.random.randn(*shape1)) + 1e-8, name="a", requires_grad=True)
        b = Tensor(np.abs(np.random.randn(*shape2)) + 1e-8, name="b", requires_grad=True)
    else:
        a = Tensor(np.random.randn(*shape1), name="a", requires_grad=True)
        b = Tensor(np.random.randn(*shape2), name="b", requires_grad=True)

    t = func(inputs=None, params=(a, b))
    t.backward()
    grad_check(func=func, inputs=None, params=(a, b), verbose=False)
def run_test_unary_elementwise(func, shape, pos_only=False, offset=1e-3):
    """Test a unary elementwise function, like exp, log, etc"""
    if pos_only:
        # Mostly for log(a) - it's positive-only and unstable too close to zero.
        a = Tensor(
            np.abs(np.random.randn(*shape)) + offset, name="a", requires_grad=True
        )
    else:
        a = Tensor(np.random.randn(*shape), name="a", requires_grad=True)

    t = func(inputs=None, params=(a,))
    t.backward()
    grad_check(func=func, inputs=None, params=(a,))
a = Tensor(np.random.randn(2, 3), name="a", requires_grad=True)
b = Tensor(np.random.randn(2, 3), name="b", requires_grad=True)

c = a + b

loss = c.sum()

loss.backward()
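Since loss = (a + b).sum(), the gradient flowing back into both a and b is a tensor of ones. A minimal check, assuming tidygrad exposes the accumulated gradient as a .grad attribute (the ∇ field in the Tensor repr shown further down):

# .grad is an assumed attribute name for the accumulated gradient
print(np.allclose(a.grad, 1.0), np.allclose(b.grad, 1.0))  # expected: True True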
def add_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.add(b, "t").sum()
    return loss

run_test_binary_elementwise(add_func, (1, 1))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def sub_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.sub(b, "t").sum("loss")
    return loss

run_test_binary_elementwise(sub_func, (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def mul_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.mul(b, "t").sum("loss")
    return loss

run_test_binary_elementwise(mul_func, (100, 100))
Max fractional gradient difference for b: 0.0001%
Max fractional gradient difference for a: 0.0002%
def pow_func(inputs, params: tuple = ()):
    a = params[0]
    loss = a.pow(2, "t").sum("loss")
    return loss

def run_test_pow(shape):
    a = Tensor(np.random.randn(*shape), name="a", requires_grad=True)
    a.data = np.where(np.abs(a.data) < 1e-5, 1e-5, a.data)

    t = pow_func(inputs=None, params=(a,))
    t.backward()

    grad_check(func=pow_func, inputs=None, params=(a,))
# XXX pow is unstable for values close to zero
# run_test_pow((100, 100))
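The instability is a property of the fractional comparison rather than of pow itself: after the clamping above, |a| can still be as small as 1e-5, where the analytic gradient 2*a is tiny, so any relative difference against a finite-difference estimate explodes. A numpy-only illustration, assuming a one-sided finite difference (a sketch, not tidygrad's grad_check):

def forward_diff(f, x, eps=1e-4):
    # One-sided estimate; its O(eps) error dominates when the true gradient is ~0
    return (f(x + eps) - f(x)) / eps

f = lambda x: x**2
for x in (1.0, 1e-5):
    analytic = 2 * x
    numeric = forward_diff(f, x)
    frac = abs(numeric - analytic) / max(abs(analytic), 1e-12)
    print(f"x={x:g}: fractional difference = {frac:.3%}")
# x=1: ~0.005% (harmless); x=1e-05: ~500% (eps dwarfs the true gradient)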
Unary elementwise functions
def log_func(inputs, params: tuple = ()):
    (a, ) = params
    loss = a.log("t").sum("loss")
    return loss

run_test_unary_elementwise(log_func, (100, 100), pos_only=True)
Max fractional gradient difference for a: 0.1248%
def exp_func(inputs, params: tuple = ()):
    (a, ) = params
    loss = a.exp("t").sum("loss")
    return loss

run_test_unary_elementwise(exp_func, (100, 100))
Max fractional gradient difference for a: 0.0028%
import tidygrad.func as F
# from tidygrad.func import relu, sigmoid, tanh, softmax, gelu, new_gelu
def sigmoid_func(inputs, params: tuple = ()):
    (a, ) = params
    t = F.sigmoid(a)
    return t.sum("loss")

run_test_unary_elementwise(sigmoid_func, (100, 100))
Max fractional gradient difference for a: 0.0005%
def tanh_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.tanh(a)
    return t.sum("loss")

run_test_unary_elementwise(tanh_func, (100, 100))
Max fractional gradient difference for a: 0.0010%
def relu_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.relu(a, "t")
    return t.sum("loss")

run_test_unary_elementwise(relu_func, (100, 100))
Max fractional gradient difference for a: 0.0000%
def gelu_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.gelu(a)
    return t.sum("loss")
# XXX Stability issues
# run_test_unary_elementwise(gelu_func, (100, 100))
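A hedged guess at the source of those stability issues, by the same argument as the pow case: GELU has a minimum near x ≈ -0.75 where its gradient crosses zero, and with 100x100 random inputs some values land close to that point, so the fractional comparison against a near-zero analytic gradient blows up. Sketch using the standard tanh approximation of GELU (an assumed reference form, not necessarily what F.gelu computes):

def gelu_tanh_np(x):
    # Tanh approximation of GELU, used here only to locate the zero-gradient point
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

eps = 1e-6
for x in (-0.7, -0.75, -0.8):
    print(x, (gelu_tanh_np(x + eps) - gelu_tanh_np(x - eps)) / (2 * eps))  # slope changes sign around -0.75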
def softmax_func(inputs, params: tuple = ()):
    (a, ) = params
    n_batch, n_classes = a.shape
    y = np.zeros(a.shape)

    np.random.seed(42)
    y[np.arange(n_batch), np.random.randint(0, n_classes, n_batch)] = 1
    y = Tensor(y, name="y")
    sm = F.softmax(a, "t")

    cross_entropy = y * sm.log() + (1 - y) * (1 - sm).log()
    return cross_entropy.sum("loss")

run_test_unary_elementwise(softmax_func, (1, 5))
Max fractional gradient difference for a: 0.0007%
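The same construction in plain numpy, for reference (a sketch; F.softmax is the tidygrad op actually under test, and the missing minus sign on the cross-entropy does not matter for a gradient check):

logits = np.random.randn(1, 5)
p = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)  # softmax: each row sums to 1
y = np.zeros_like(p)
y[np.arange(1), np.random.randint(0, 5, 1)] = 1                  # one-hot targets
ce = y * np.log(p) + (1 - y) * np.log(1 - p)                     # elementwise (un-negated) BCE terms
print(p.sum(axis=-1), ce.sum())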
def matmul_func(inputs, params: tuple[Tensor] = ()):
    a, b = params
    t = a.mmul(b, "t")
    return t.sum("loss")

def run_test_matmul(shape1, shape2):
    a = Tensor(np.random.randn(*shape1), name="a", requires_grad=True)
    b = Tensor(np.random.randn(*shape2), name="b", requires_grad=True)
    t = matmul_func(inputs=None, params=(a, b))

    t.backward()

    grad_check(func=matmul_func, inputs=None, params=(a, b))

run_test_matmul((10, 100), (100, 50))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
Broadcasting
run_test_binary_elementwise(add_func, (2, 10, 1), (10, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
run_test_matmul((2, 10, 100), (100, 10))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
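The shapes in these two tests follow ordinary numpy broadcasting rules; the backward pass then has to sum gradients over the broadcast dimensions to recover each operand's original shape. Quick numpy-only shape check:

print(np.broadcast_shapes((2, 10, 1), (10, 100)))           # (2, 10, 100)
print((np.zeros((2, 10, 100)) @ np.zeros((100, 10))).shape)  # (2, 10, 10): batched matmul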
Test loss functions
# def lt_func(inputs, params: tuple = ()):
# a, b = params
# loss = (a < b).sum("loss")
# return loss
# run_test_binary_elementwise(lt_func, (100, 100), (100, 100))
# a = Tensor(np.random.randn(100, 100), name="a")
# b = Tensor(np.random.randn(100, 100), name="b")
# t = lt_func(inputs=None, params=(a, b))
# t.backward()
# from tidygrad.functional import BCE_loss
def bceloss_func(inputs, params: tuple = ()):
    y = inputs[0]
    x = params[0]

    loss = F.BCE_loss(x, y).sum("loss")
    return loss

x = Tensor(np.random.randn(100), name="x", requires_grad=True)
y = Tensor(np.random.randn(100), name="y", requires_grad=True)

t = bceloss_func(inputs=(y, ), params=(x, ))
t.backward()
grad_check(func=bceloss_func, inputs=(y, ), params=(x, ))
Max fractional gradient difference for x: 0.0029%
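Both x and y are raw Gaussians rather than probabilities, which is fine for a gradient check: it only verifies that the analytic and numeric gradients of whatever BCE_loss computes agree. For reference, a numerically stable binary cross-entropy on logits is usually written like this in numpy (an assumed reference form, not necessarily tidygrad's BCE_loss):

def bce_with_logits_np(x, y):
    # max(x, 0) - x*y + log(1 + exp(-|x|)) avoids overflow for large |x|
    return np.maximum(x, 0) - x * y + np.log1p(np.exp(-np.abs(x)))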
Test Dropout
# from tidygrad.functional import dropout
def dropout_func(inputs, params: tuple = ()):
    p = params[0]

    # Reseed so every re-evaluation inside grad_check uses the same dropout mask
    np.random.seed(1337)
    t = F.dropout(p, 0.3, training=True)
    return t.sum("loss")

p = Tensor(np.random.randn(100), name="p", requires_grad=True)

t = dropout_func(inputs=None, params=(p, ))
t.backward()
grad_check(func=dropout_func, inputs=None, params=(p, ))
Max fractional gradient difference for p: 0.0000%
Test Embedding
# from tidygrad.functional import embedding
def embedding_func(inputs, params: tuple = ()):
    idxs = inputs[0]
    w = params[0]
    t = F.embedding(w, idxs, "t")
    return t.sum("loss")

idxs = [1, 2, 3, 4, 5, 6, 7, 8, 9]
w = Tensor(np.random.randn(10, 100), name="w", requires_grad=True)

t = embedding_func(inputs=(idxs, ), params=(w, ))
t.backward()
grad_check(func=embedding_func, inputs=(idxs, ), params=(w, ))
Max fractional gradient difference for w: 0.0000%
Test sum, mean and std
def sum_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.sum("t")
    return t.sum("loss")

run_test_unary_elementwise(sum_test, (100, 100))
Max fractional gradient difference for a: 0.0000%
def mean_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.mean("t")
    return t.sum("loss")

run_test_unary_elementwise(mean_test, (100, 100))
Max fractional gradient difference for a: 0.0000%
def std_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.std("t")
    return t.sum("loss")

run_test_unary_elementwise(std_test, (100, 100))
Max fractional gradient difference for a: 0.0049%
a = Tensor(np.random.randn(100, 100), name="a", requires_grad=True)

a**3
Tensor[100, 100](name="" op=Pow parents=[a]):
v=array[100, 100] n=10000 (78Kb) x∈[-41.412, 47.474] μ=0.066 σ=3.739
∇=array[100, 100] n=10000 (78Kb) all_zeros
# from tidygrad.functional import stack, concat
def stack_test(inputs, params: tuple = ()):
    t = F.stack(params, name="t")
    return t.sum("loss")

run_test_binary_elementwise(stack_test, (100, 100), (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def concat_test(inputs, params: tuple = ()):
    t = F.concat(params, name="t")
    return t.sum("loss")

run_test_binary_elementwise(concat_test, (100, 100), (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
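Shape check with the numpy equivalents (stacking/concatenating along axis 0 is assumed here):

print(np.stack([np.zeros((100, 100)), np.zeros((100, 100))]).shape)        # (2, 100, 100)
print(np.concatenate([np.zeros((100, 100)), np.zeros((100, 100))]).shape)  # (200, 100)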
from tidygrad.func import layer_norm
def layer_norm_test(inputs, params):
    a, w, b = params
    t = layer_norm(a, w, b)
    return t.sum("loss")

a = Tensor(np.random.randn(2, 100, 100), name="a", requires_grad=True)
w = Tensor(np.random.randn(100), name="w", requires_grad=True)
b = Tensor(np.random.randn(100), name="b", requires_grad=True)

t = layer_norm_test(inputs=None, params=(a, w, b))
t.backward()
grad_check(func=layer_norm_test, inputs=None, params=(a, w, b))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for w: 0.0000%
Max fractional gradient difference for a: 0.0074%
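For reference, a plain-numpy layer norm over the last axis, which is what the (2, 100, 100) activations with (100,)-shaped scale and shift suggest (a sketch under that assumption, not tidygrad's layer_norm):

def layer_norm_np(a, w, b, eps=1e-5):
    mu = a.mean(axis=-1, keepdims=True)
    var = a.var(axis=-1, keepdims=True)
    return w * (a - mu) / np.sqrt(var + eps) + b  # normalize over the last axis, then scale and shift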