Binary elementwise ops

import numpy as np
from tidygrad.tensor import Tensor
from tidygrad.utils.grad_check import grad_check
def run_test_binary_elementwise(func, shape1, shape2=None, pos_only=False):
    """Test a binary elementwise function, like add, mul, etc"""
    shape2 = shape1 if shape2 is None else shape2
    if pos_only:
        a = Tensor(np.abs(np.random.randn(*shape1)) + 1e-8, name="a", requires_grad=True)
        b = Tensor(np.abs(np.random.randn(*shape2)) + 1e-8, name="b", requires_grad=True)
    else:
        a = Tensor(np.random.randn(*shape1), name="a", requires_grad=True)
        b = Tensor(np.random.randn(*shape2), name="b", requires_grad=True)

    t = func(inputs=None, params=(a, b))
    t.backward()
    grad_check(func=func, inputs=None, params=(a, b), verbose=False)
def run_test_unary_elementwise(func, shape, pos_only=False, offset=1e-3):
    """Test a unary elementwise function, like exp, log, etc"""
    if pos_only:
        # Mostly for log(a) - its domain is positive only, and it's unstable too close to zero.
        a = Tensor(
            np.abs(np.random.randn(*shape)) + offset, name="a", requires_grad=True
        )
    else:
        a = Tensor(np.random.randn(*shape), name="a", requires_grad=True)

    t = func(inputs=None, params=(a,))
    t.backward()
    grad_check(func=func, inputs=None, params=(a,))
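
Under the hood, grad_check compares the analytic gradients from backward() against numerical estimates. A minimal sketch of the idea using central differences (illustrative only, not tidygrad's actual implementation):

def numeric_grad(f, x, eps=1e-6):
    # Central-difference estimate of d f(x) / d x for a scalar-valued f over array x.
    g = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    for _ in it:
        i = it.multi_index
        orig = x[i]
        x[i] = orig + eps
        f_plus = f(x)
        x[i] = orig - eps
        f_minus = f(x)
        x[i] = orig
        g[i] = (f_plus - f_minus) / (2 * eps)
    return g

# Sanity check: loss = sum(x**2) has analytic gradient 2*x.
x_np = np.random.randn(3, 3)
assert np.allclose(numeric_grad(lambda v: (v**2).sum(), x_np), 2 * x_np, atol=1e-4)
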
a = Tensor(np.random.randn(2, 3), name="a", requires_grad=True)
b = Tensor(np.random.randn(2, 3), name="b", requires_grad=True)

c = a + b

loss = c.sum()

loss.backward()
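
Since loss = (a + b).sum(), both gradients should be all ones. Assuming Tensor exposes the accumulated gradient as a NumPy array via .grad (an assumption about the API), that can be checked directly:

# d(sum(a + b))/da = d(sum(a + b))/db = ones with the operands' shapes.
# Assumes .grad holds the accumulated gradient as a NumPy array.
assert np.allclose(a.grad, np.ones((2, 3)))
assert np.allclose(b.grad, np.ones((2, 3)))
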
def add_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.add(b, "t").sum("loss")
    return loss

run_test_binary_elementwise(add_func, (1, 1))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def sub_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.sub(b, "t").sum("loss")
    return loss

run_test_binary_elementwise(sub_func, (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def mul_func(inputs, params: tuple = ()):
    a, b = params
    loss = a.mul(b, "t").sum("loss")
    return loss

run_test_binary_elementwise(mul_func, (100, 100))
Max fractional gradient difference for b: 0.0001%
Max fractional gradient difference for a: 0.0002%
def pow_func(inputs, params: tuple = ()):
    a = params[0]
    loss = a.pow(2, "t").sum("loss")
    return loss


def run_test_pow(shape):
    a = Tensor(np.random.randn(*shape), name="a", requires_grad=True)
    a.data = np.where(np.abs(a.data) < 1e-5, 1e-5, a.data)

    t = pow_func(inputs=None, params=(a,))

    t.backward()

    grad_check(func=pow_func, inputs=None, params=(a,))


# XXX pow is unstable for values close to zero
# run_test_pow((100, 100))
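
The instability is in the check, not the gradient: with loss = sum(a**2) the analytic gradient is 2*a, so the fractional-difference metric divides by a number that goes to zero as a does, and any tiny absolute error in the numerical estimate looks enormous. A rough arithmetic illustration:

# Near zero the analytic gradient 2*a is tiny, so a fixed small absolute error in
# the numerical estimate becomes a large *fractional* difference.
analytic_small, analytic_large, abs_err = 2 * 1e-8, 2 * 1.0, 1e-10
print(abs_err / analytic_small)  # 0.005 -> a 0.5% fractional difference
print(abs_err / analytic_large)  # 5e-11 -> effectively zero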

Unary elementwise functions

def log_func(inputs, params: tuple = ()):
    (a, ) = params

    loss = a.log("t").sum("loss")
    return loss

run_test_unary_elementwise(log_func, (100, 100), pos_only=True)
Max fractional gradient difference for a: 0.1248%
def exp_func(inputs, params: tuple = ()):
    (a, ) = params

    loss = a.exp("t").sum("loss")
    return loss

run_test_unary_elementwise(exp_func, (100, 100))
Max fractional gradient difference for a: 0.0028%
import tidygrad.func as F

# from tidygrad.func import relu, sigmoid, tanh, softmax, gelu, new_gelu
def sigmoid_func(inputs, params: tuple = ()):
    (a, ) = params
    t = F.sigmoid(a)
    return t.sum("loss")

run_test_unary_elementwise(sigmoid_func, (100, 100))
Max fractional gradient difference for a: 0.0005%
def tanh_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.tanh(a)
    return t.sum("loss")


run_test_unary_elementwise(tanh_func, (100, 100))
Max fractional gradient difference for a: 0.0010%
def relu_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.relu(a, "t")
    return t.sum("loss")


run_test_unary_elementwise(relu_func, (100, 100))
Max fractional gradient difference for a: 0.0000%
def gelu_func(inputs, params: tuple = ()):
    (a,) = params
    t = F.gelu(a)
    return t.sum("loss")


# XXX Stability issues
# run_test_unary_elementwise(gelu_func, (100, 100))
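
For reference, GELU is x·Φ(x), with Φ the standard normal CDF; the tanh form is the common approximation (the commented-out import above suggests tidygrad has both a gelu and a new_gelu, but which variant F.gelu implements is an assumption). A NumPy sketch of the two forms:

from math import erf

def gelu_exact(x):
    # x * Phi(x), with Phi the standard normal CDF
    return x * 0.5 * (1.0 + np.vectorize(erf)(x / np.sqrt(2.0)))

def gelu_tanh(x):
    # tanh approximation, as used e.g. in GPT-2
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

xs = np.linspace(-3, 3, 7)
print(np.max(np.abs(gelu_exact(xs) - gelu_tanh(xs))))  # the two agree closely
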
def softmax_func(inputs, params: tuple = ()):
    (a, ) = params
    n_batch, n_classes = a.shape
    y = np.zeros(a.shape)
    np.random.seed(42)
    y[np.arange(n_batch), np.random.randint(0, n_classes, n_batch)] = 1
    y = Tensor(y, name="y")
    sm = F.softmax(a, "t")

    cross_entropy = y * sm.log() + (1-y) * (1 - sm).log()
    return cross_entropy.sum("loss")

run_test_unary_elementwise(softmax_func, (1, 5))
Max fractional gradient difference for a: 0.0007%
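
For reference, softmax normalizes each row into a probability distribution; a standard NumPy implementation subtracts the row max before exponentiating so exp() cannot overflow (whether F.softmax does the same internally is an assumption):

def softmax_ref(z):
    # Shifting by the row max doesn't change the result (softmax is shift-invariant
    # per row) but keeps exp() from overflowing for large inputs.
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

print(softmax_ref(np.array([[1.0, 2.0, 3.0]])))  # each row sums to 1
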
def matmul_func(inputs, params: tuple[Tensor] = ()):
    a, b = params
    t = a.mmul(b, "t")
    return t.sum("loss")

def run_test_matmul(shape1, shape2):
    a = Tensor(np.random.randn(*shape1), name="a", requires_grad=True)
    b = Tensor(np.random.randn(*shape2), name="b", requires_grad=True)
    t = matmul_func(inputs=None, params=(a, b))
    t.backward()

    grad_check(func=matmul_func, inputs=None, params=(a, b))

run_test_matmul((10, 100), (100, 50))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
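
The gradients being checked here follow the standard matmul identities: for C = A @ B with upstream gradient dC, dA = dC @ B.T and dB = A.T @ dC. With loss = C.sum(), dC is all ones, so a quick NumPy check of those identities looks like:

A, B = np.random.randn(10, 100), np.random.randn(100, 50)
dC = np.ones((10, 50))           # upstream gradient of sum() w.r.t. C = A @ B
dA, dB = dC @ B.T, A.T @ dC      # standard matmul backward identities
assert dA.shape == A.shape and dB.shape == B.shape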

Broadcasting

run_test_binary_elementwise(add_func, (2, 10, 1), (10, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
run_test_matmul((2, 10, 100), (100, 10))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
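
When operands are broadcast in the forward pass, the backward pass has to sum the upstream gradient over the broadcast axes so each gradient matches its operand's original shape. A NumPy sketch of that reduction for the (2, 10, 1) + (10, 100) case above (illustrative, not tidygrad's internals):

grad_out = np.ones((2, 10, 100))               # a (2, 10, 1) + b (10, 100) broadcasts to this
grad_a = grad_out.sum(axis=-1, keepdims=True)  # a was expanded along the last axis -> (2, 10, 1)
grad_b = grad_out.sum(axis=0)                  # b gained a leading batch axis -> (10, 100)
assert grad_a.shape == (2, 10, 1) and grad_b.shape == (10, 100)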

Test loss functions

# def lt_func(inputs, params: tuple = ()):
#     a, b = params
#     loss = (a < b).sum("loss")
#     return loss

# run_test_binary_elementwise(lt_func, (100, 100), (100, 100))

# a = Tensor(np.random.randn(100, 100), name="a")
# b = Tensor(np.random.randn(100, 100), name="b")

# t = lt_func(inputs=None, params=(a, b))
# t.backward()
# from tidygrad.functional import BCE_loss
def bceloss_func(inputs, params: tuple = ()):
    y = inputs[0]
    x = params[0]

    loss = F.BCE_loss(x, y).sum("loss")
    return loss

x = Tensor(np.random.randn(100), name="x", requires_grad=True)
y = Tensor(np.random.randn(100), name="y", requires_grad=True)

t = bceloss_func(inputs=(y, ), params=(x, ))
t.backward()

grad_check(func=bceloss_func, inputs=(y, ), params=(x, ))
Max fractional gradient difference for x: 0.0029%
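
For reference, binary cross-entropy on a logit x with target y is -(y·log σ(x) + (1-y)·log(1-σ(x))); whether F.BCE_loss expects logits or probabilities is an assumption here, but the reference formula in NumPy is:

def bce_ref(x, y):
    # BCE on logits: -( y*log(sigmoid(x)) + (1-y)*log(1-sigmoid(x)) )
    s = 1.0 / (1.0 + np.exp(-x))
    return -(y * np.log(s) + (1 - y) * np.log(1 - s))

print(bce_ref(np.array([0.0, 2.0]), np.array([1.0, 0.0])))  # ~[0.693, 2.127]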

Test Dropout

# from tidygrad.functional import dropout
def dropout_func(inputs, params: tuple = ()):
    p = params[0]

    np.random.seed(1337)
    t = F.dropout(p, 0.3, training=True)
    return t.sum("loss")

p = Tensor(np.random.randn(100), name="p", requires_grad=True)

t = dropout_func(inputs=None, params=(p, ))
t.backward()

grad_check(func=dropout_func, inputs=None, params=(p, ))
Max fractional gradient difference for p: 0.0000%
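
A common formulation is inverted dropout: zero a fraction p of the elements and scale the survivors by 1/(1-p) so the expected value of the output matches the input; whether F.dropout uses exactly this scaling is an assumption. A NumPy sketch:

def dropout_ref(x, p, training=True):
    # Inverted dropout: drop with probability p, scale survivors by 1/(1-p).
    if not training:
        return x
    mask = (np.random.rand(*x.shape) >= p).astype(x.dtype)
    return x * mask / (1.0 - p)

print(dropout_ref(np.ones(10), 0.3))  # surviving entries are scaled to ~1.43
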
Test Embedding

# from tidygrad.functional import embedding
def embedding_func(inputs, params: tuple = ()):
    idxs = inputs[0]
    w = params[0]
    t = F.embedding(w, idxs, "t")
    return t.sum("loss")

idxs = [1, 2, 3, 4, 5, 6, 7, 8, 9]
w = Tensor(np.random.randn(10, 100), name="w", requires_grad=True)

t = embedding_func(inputs=(idxs, ), params=(w, ))
t.backward()

grad_check(func=embedding_func, inputs=(idxs, ), params=(w, ))
Max fractional gradient difference for w: 0.0000%
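
An embedding lookup is just row indexing into the weight matrix, and its backward is a scatter-add of the upstream gradient into the selected rows. A NumPy sketch (illustrative, not tidygrad's implementation):

w_np = np.random.randn(10, 100)
idx = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
out = w_np[idx]                        # forward: gather rows -> (9, 100)
grad_out = np.ones_like(out)           # upstream gradient from sum()
grad_w = np.zeros_like(w_np)
np.add.at(grad_w, idx, grad_out)       # backward: scatter-add into the selected rows
assert grad_w[0].sum() == 0 and np.allclose(grad_w[1:], 1.0)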

Test sum, mean, and std

def sum_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.sum("t")
    return t.sum("loss")

run_test_unary_elementwise(sum_test, (100, 100))
Max fractional gradient difference for a: 0.0000%
def mean_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.mean("t")
    return t.sum("loss")

run_test_unary_elementwise(mean_test, (100, 100))
Max fractional gradient difference for a: 0.0000%
def std_test(inputs, params: tuple = ()):
    a = params[0]
    t = a.std("t")
    return t.sum("loss")


run_test_unary_elementwise(std_test, (100, 100))
Max fractional gradient difference for a: 0.0049%
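
For reference, with the population convention σ = sqrt(mean((a - μ)²)) the gradient of std with respect to each element is (a_i - μ) / (n·σ); which ddof convention Tensor.std uses is an assumption. A quick finite-difference check of that formula in NumPy:

# d std / d a_i = (a_i - mean) / (n * std), assuming population std (ddof=0).
a_np = np.random.randn(50)
n, mu, sigma = a_np.size, a_np.mean(), a_np.std()
analytic = (a_np - mu) / (n * sigma)
eps, i = 1e-6, 7
bumped = a_np.copy()
bumped[i] += eps
numeric = (bumped.std() - sigma) / eps
assert abs(numeric - analytic[i]) < 1e-5
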
a = Tensor(np.random.randn(100, 100), name="a", requires_grad=True)

a**3
Tensor[100, 100](name="" op=Pow parents=[a]):
    v=array[100, 100] n=10000 (78Kb) x∈[-41.412, 47.474] μ=0.066 σ=3.739
    ∇=array[100, 100] n=10000 (78Kb) all_zeros
# from tidygrad.functional import stack, concat
def stack_test(inputs, params: tuple = ()):
    t = F.stack(params, name="t")
    return t.sum("loss")


run_test_binary_elementwise(stack_test, (100, 100), (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
def concat_test(inputs, params: tuple = ()):
    t = F.concat(params, name="t")
    return t.sum("loss")


run_test_binary_elementwise(concat_test, (100, 100), (100, 100))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for a: 0.0000%
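
Stack and concat route gradients back by slicing: each input receives the slice of the upstream gradient that corresponds to its position in the output. A NumPy sketch:

a_np, b_np = np.random.randn(100, 100), np.random.randn(100, 100)
stacked = np.stack([a_np, b_np])           # (2, 100, 100)
grad_out = np.ones_like(stacked)           # upstream gradient from sum()
grad_a, grad_b = grad_out[0], grad_out[1]  # un-stack to recover each input's gradient
assert grad_a.shape == a_np.shape and grad_b.shape == b_np.shape
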
from tidygrad.func import layer_norm
def layer_norm_test(inputs, params):
    a, w, b = params
    t = layer_norm(a, w, b)
    return t.sum("loss")

a = Tensor(np.random.randn(2, 100, 100), name="a", requires_grad=True)
w = Tensor(np.random.randn(100), name="w", requires_grad=True)
b = Tensor(np.random.randn(100), name="b", requires_grad=True)

t = layer_norm_test(inputs=None, params=(a, w, b))
t.backward()

grad_check(func=layer_norm_test, inputs=None, params=(a, w, b))
Max fractional gradient difference for b: 0.0000%
Max fractional gradient difference for w: 0.0000%
Max fractional gradient difference for a: 0.0074%
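
For reference, layer norm normalizes over the last axis and then applies the per-feature scale w and shift b; a NumPy sketch of the forward pass (the epsilon value and axis choice are assumptions about tidygrad's implementation):

def layer_norm_ref(x, w, b, eps=1e-5):
    # Normalize each feature vector (last axis) to zero mean / unit variance,
    # then apply the learned per-feature scale and shift.
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b

out = layer_norm_ref(np.random.randn(2, 100, 100), np.random.randn(100), np.random.randn(100))
print(out.shape)  # (2, 100, 100)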