import os
"CPU"] = "1"
os.environ["DEBUG"]="4"
os.environ[# os.environ["NOOPT"]="1"
from tinygrad import Tensor, dtypes
from tinygrad.ops import UOp, Ops, PatternMatcher, UPat, graph_rewrite
4 - The .arange() insanity
Let’s try something light and fun - a simple arange.
a = Tensor.arange(0.5, 2, 0.2) # Start, stop, step => [0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9]
a.lazydata
UOp(Ops.ADD, dtypes.float, arg=None, src=(
  UOp(Ops.RESHAPE, dtypes.float, arg=(8,), src=(
    UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=(
      UOp(Ops.PERMUTE, dtypes.float, arg=(1, 0), src=(
        UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8), src=(
          UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8, 1), src=(
            UOp(Ops.SHRINK, dtypes.float, arg=((0, 8), (0, 8)), src=(
              UOp(Ops.RESHAPE, dtypes.float, arg=(8, 16), src=(
                UOp(Ops.SHRINK, dtypes.float, arg=((0, 128),), src=(
                  UOp(Ops.RESHAPE, dtypes.float, arg=(135,), src=(
                    UOp(Ops.EXPAND, dtypes.float, arg=(9, 15), src=(
                      UOp(Ops.RESHAPE, dtypes.float, arg=(1, 15), src=(
                        UOp(Ops.PAD, dtypes.float, arg=((7, 0),), src=(
                          UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
                            UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
                              UOp(Ops.CONST, dtypes.float, arg=0.2, src=(
                                x15:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
                                  UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),)),)),)),)),)),)),)),)),)),)),)),)),)),)),)),
  UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
    UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
      UOp(Ops.CONST, dtypes.float, arg=0.3, src=(
        x15,)),)),)),))
Ohh, what the hell? I just want 8 numbers, what’s this insanity?
When something is not obvious, you can usually just go step by step.
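One way to do that is to just walk the src chain yourself; here's a tiny throwaway helper for poking at the graph (my own, not a tinygrad API):

def walk(u, depth=0):
  # Print each UOp with its arg, indented by its depth in the graph.
  print("  " * depth, u.op, u.arg)
  for s in u.src:
    walk(s, depth + 1)

walk(a.lazydata)

Below, we'll do exactly that: start from the deepest ops and work back up to the top ADD.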
UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
  UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
    UOp(Ops.CONST, dtypes.float, arg=0.2, src=(
0.2 is the step of our arange, reshaped to (1,) and broadcast into 8 elements.
Next we pad it with 7 (n-1) zero elements on the left and expand vertically to 9 (n+1) rows:
UOp(Ops.EXPAND, dtypes.float, arg=(9, 15), src=(
  UOp(Ops.RESHAPE, dtypes.float, arg=(1, 15), src=(
    UOp(Ops.PAD, dtypes.float, arg=((7, 0),), src=(
Next, we string it out into a 135-element array and cut off the last 7 elements, making it end in [..., 0, 0, 0, 0.2]. Then we reshape it back to a rectangle, now (8, 16). The transformation created an interesting pattern:
UOp(Ops.RESHAPE, dtypes.float, arg=(8, 16), src=(
  UOp(Ops.SHRINK, dtypes.float, arg=((0, 128),), src=(
    UOp(Ops.RESHAPE, dtypes.float, arg=(135,), src=(
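To actually see that pattern, here is a small NumPy recreation of the movement ops so far (my own sketch with made-up names, not tinygrad code):

import numpy as np

n, step = 8, 0.2
row = np.pad(np.full(n, step, dtype=np.float32), (n - 1, 0))  # the PAD: 7 zeros, then 8 steps
grid = np.broadcast_to(row, (n + 1, 2 * n - 1))               # the EXPAND to (9, 15)
flat = grid.reshape(-1)[: 2 * n * n]                          # RESHAPE to (135,), SHRINK to 128
pattern = flat.reshape(n, 2 * n)                              # RESHAPE to (8, 16)
print(pattern)

Each row holds the same 8 copies of 0.2, but shifted one column to the left relative to the row above, so the left 8 columns form a lower-triangular staircase of 0.2s.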
Next we take just the left half of the pattern, and apply 3 transformations that don't actually do anything.
We will explore the source of those transforms later.
UOp(Ops.PERMUTE, dtypes.float, arg=(1, 0), src=(
  UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8), src=(
    UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8, 1), src=(
      UOp(Ops.SHRINK, dtypes.float, arg=((0, 8), (0, 8)), src=(
This was a long way to get a triangular matrix (sad Linear Algebra noises).
But now that we have it, let’s sum over the elements in one axis (axis 1 in this case):
UOp(Ops.RESHAPE, dtypes.float, arg=(8,), src=(
  UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=(
I think you see where this is going. Let's cover the second branch of the top ADD:
UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
  UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
    UOp(Ops.CONST, dtypes.float, arg=0.3, src=(
And the final step - we add the two branches to get our numbers! (The 0.3 is the start element, 0.5, minus the step, 0.2, so the cumulative sums land exactly on 0.5, 0.7, ..., 1.9.)
UOp(Ops.ADD, dtypes.float, arg=None, src=(
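So, after all the reshuffling, the whole graph boils down to a cumulative sum of the step plus a constant shift. A quick NumPy check of that claim (my own sketch):

import numpy as np

start, step, n = 0.5, 0.2, 8
print(np.full(n, step, dtype=np.float32).cumsum() + (start - step))
# [0.5 0.7 0.9 1.1 1.3 1.5 1.7 1.9], up to float32 rounding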
Let's have a look at the generated code, I guess.
a.numpy()
opened device CPU from pid:154318
r_8_8
0: (8, 1) float.ptr(8) (1, 0) ShapeTracker(views=(View(shape=(8, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),))
[Opt(op=OptOps.UNROLL, axis=0, arg=0)]
void r_8_8(float* restrict data0) {
for (int ridx0 = 0; ridx0 < 8; ridx0++) {
*(data0+ridx0) = ((((ridx0<7)!=1)?0.2f:0.0f)+(((ridx0<6)!=1)?0.2f:0.0f)+(((ridx0<5)!=1)?0.2f:0.0f)+(((ridx0<4)!=1)?0.2f:0.0f)+(((ridx0<3)!=1)?0.2f:0.0f)+(((ridx0<2)!=1)?0.2f:0.0f)+(((ridx0<1)!=1)?0.2f:0.0f)+0.5f);
}
}
*** CPU 1 r_8_8 arg 1 mem 0.00 GB tm 2.79us/ 0.00ms ( 0.08 GFLOPS 0.0|0.0 GB/s) ['numpy', 'arange']
array([0.5 , 0.7 , 0.9 , 1.1 , 1.3 , 1.5 ,
1.7 , 1.9000001], dtype=float32)
Not gonna lie, this is an ugly way to do it. I suspect this could be improved. Here is the kernel again, reformatted for readability:
void r_8_8(float* restrict data0) {
for (int ridx0 = 0; ridx0 < 8; ridx0++) {
*(data0 + ridx0) = ((((ridx0 < 7) != 1) ? 0.2f : 0.0f) +
(((ridx0 < 6) != 1) ? 0.2f : 0.0f) +
(((ridx0 < 5) != 1) ? 0.2f : 0.0f) +
(((ridx0 < 4) != 1) ? 0.2f : 0.0f) +
(((ridx0 < 3) != 1) ? 0.2f : 0.0f) +
(((ridx0 < 2) != 1) ? 0.2f : 0.0f) +
(((ridx0 < 1) != 1) ? 0.2f : 0.0f) + 0.5f);
}
}
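For what it's worth, the unrolled sum is at least doing something sensible: each ((ridx0 < k) != 1) term contributes 0.2 once ridx0 reaches k, so element i ends up as 0.5 plus i copies of 0.2. The same thing in plain Python (just an illustration):

# What the unrolled kernel computes for output index i: 0.5 plus one 0.2 per k <= i.
def element(i: int) -> float:
  return sum(0.2 for k in range(1, 8) if not (i < k)) + 0.5

print([round(element(i), 1) for i in range(8)])  # [0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9]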
This must be some optimization misbehaving. With NOOPT=1 we get good code:
void r_8_8(float* restrict data0) {
for (int ridx0 = 0; ridx0 < 8; ridx0++) {
*(data0+ridx0) = ((((float)((ridx0+1)))*0.2f)+0.3f);
}
}
At least TinyGrad was able to optimize out all the insane transformations!
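If you want to reproduce the NOOPT=1 run, flip the commented-out line in the setup at the top; setting the variable before tinygrad is imported is the safe bet:

import os
os.environ["NOOPT"] = "1"   # skip the hand-coded kernel optimizations (like the UNROLL above)
os.environ["DEBUG"] = "4"   # keep printing the generated kernels

from tinygrad import Tensor
Tensor.arange(0.5, 2, 0.2).numpy()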
Let’s take a look at the code:
tensor.py (simplified)
def arange(start, stop=None, step=1, **kwargs) -> Tensor:
  dtype = ... # Figure out the output dtype
  output_len = ceildiv(stop-start, step)
  return (Tensor.full((output_len,), step, dtype=dtype, **kwargs)._cumalu(0, Ops.ADD) + (start - step)).cast(dtype)
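For our call, output_len = ceildiv(2 - 0.5, 0.2) = ceil(7.5) = 8 and start - step = 0.5 - 0.2 = 0.3: exactly the 8-element shape and the 0.3 constant we saw in the graph.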
Tensor.full((output_len,), step, dtype=dtype, **kwargs) - This creates the (8,) vector filled with 0.2.
(start - step) - This is the second branch of the top ADD.
_cumalu(0, Ops.ADD) - And all the magic happens here.
def _cumalu(self, axis:int, op:Ops, _include_initial=False) -> Tensor:
  pl_sz = self.shape[axis] - int(not _include_initial)
  pooled = self.transpose(axis,-1).pad((pl_sz, -int(_include_initial)), value=identity_element(op, self.dtype))._pool((self.shape[axis],))
  # For ADD:
  return pooled.sum(-1).transpose(axis, -1)
pl_sz will be 7.
self.transpose(axis,-1) makes the axis we are interested in the last. In our case we only have 1 axis, so this does nothing, and gets optimized out immediately.
pad((pl_sz, -int(_include_initial))) - Here is the pad that adds the 7 elements to the left, filled with zeros (the identity element for ADD).
_pool((self.shape[axis],)): Here be dragons.
def _pool(self, k_:tuple[sint, ...], stride:int|tuple[int, ...]=1, dilation:int|tuple[int, ...]=1) -> Tensor:
  assert len(self.shape) >= len(k_), f"can't pool {self.shape} with {k_}"
  s_, d_ = make_tuple(stride, len(k_)), make_tuple(dilation, len(k_))
  assert len(k_) == len(s_) == len(d_), f"stride/dilation mismatch kernel:{k_} stride:{s_} dilation:{d_}"
  noop, i_ = [None] * (self.ndim-len(k_)), self.shape[-len(k_):]
  assert all(resolve(d*(k-1)+1 <= i) for k,d,i in zip(k_,d_,i_)), "kernel size cannot be greater than actual input size"
  o_ = [ceildiv(i-d*(k-1), s) for i,d,k,s in zip(i_,d_,k_,s_)]
  if any(resolve(k > s) for k,s in zip(k_,s_)) or any(d != 1 for d in d_):
    # input size scaling factor to make sure shrink for stride is possible
    f_ = [1 + int(resolve(o*s > (i - d*(k-1)))) for o,s,i,d,k in zip(o_,s_,i_,d_,k_)]
    # repeats such that we don't need padding
    x = self.repeat([1]*len(noop) + [ceildiv(k*(i*f+d),i) for k,i,d,f in zip(k_,i_,d_,f_)])
    # handle dilation
    x = x.shrink(tuple(noop + [(0,k*(i*f+d)) for k,i,d,f in zip(k_,i_,d_,f_)])).reshape(noop + flatten((k,(i*f+d)) for k,i,d,f in zip(k_,i_,d_,f_)))
    # handle stride
    x = x.shrink(tuple(noop + flatten(((0,k), (0,o*s)) for k,o,s in zip(k_,o_,s_)))).reshape(noop + flatten((k,o,s) for k,o,s in zip(k_,o_,s_)))
    x = x.shrink(tuple(noop + flatten(((0,k), (0,o), (0,1)) for k,o in zip(k_,o_)))).reshape(noop + flatten((k,o) for k,o in zip(k_,o_)))
    # permute to move reduce to the end
    return x.permute(*range(len(noop)), *[len(noop)+i*2+1 for i in range(len(i_))], *[len(noop)+i*2 for i in range(len(i_))])
  # TODO: once the shapetracker can optimize well, remove this alternative implementation
  x = self.pad(tuple(noop + [(0, max(0,o*s-i)) for i,o,s in zip(i_,o_,s_)])).shrink(tuple(noop + [(0,o*s) for o,s in zip(o_,s_)]))
  x = x.reshape(noop + flatten(((o,s) for o,s in zip(o_,s_))))
  x = x.shrink(tuple(noop + flatten(((0,o), (0,k)) for o,k in zip(o_,k_))))
  return x.permute(*range(len(noop)), *[len(noop)+i*2 for i in range(len(i_))], *[len(noop)+i*2+1 for i in range(len(i_))])
Lol I’m not going to pretend I can follow what’s going on here. This obviously does the crazy triangle.
pooled.sum(-1).transpose(axis, -1) - This must be that REDUCE_AXIS operation, and another transpose that does nothing in this case.
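For this 1-D ADD case, the net effect of the pad, the _pool, and the sum is just a sliding-window sum over the padded step vector. A NumPy sketch of that equivalence (again my own recreation, not what tinygrad actually executes):

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

n, step = 8, 0.2
padded = np.pad(np.full(n, step, dtype=np.float32), (n - 1, 0))  # the pad that adds 7 zeros on the left
windows = sliding_window_view(padded, n)  # roughly what _pool((n,)) lays out: shape (8, 8)
print(windows.sum(axis=-1))               # the cumulative sum: [0.2 0.4 0.6 ... 1.6]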
Sometimes I wish TinyGrad was easier to understand. :)