Adventures with TinyGrad

4 - The .arange() insanity

import os
os.environ["CPU"] = "1"
os.environ["DEBUG"] = "4"
# os.environ["NOOPT"]="1"

from tinygrad import Tensor, dtypes
from tinygrad.ops import UOp, Ops, PatternMatcher, UPat, graph_rewrite

Let’s try something light and fun - a simple arange.

a = Tensor.arange(0.5, 2, 0.2) # Start, stop, step => [0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9]
a.lazydata
UOp(Ops.ADD, dtypes.float, arg=None, src=(
  UOp(Ops.RESHAPE, dtypes.float, arg=(8,), src=(
    UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=(
      UOp(Ops.PERMUTE, dtypes.float, arg=(1, 0), src=(
        UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8), src=(
          UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8, 1), src=(
            UOp(Ops.SHRINK, dtypes.float, arg=((0, 8), (0, 8)), src=(
              UOp(Ops.RESHAPE, dtypes.float, arg=(8, 16), src=(
                UOp(Ops.SHRINK, dtypes.float, arg=((0, 128),), src=(
                  UOp(Ops.RESHAPE, dtypes.float, arg=(135,), src=(
                    UOp(Ops.EXPAND, dtypes.float, arg=(9, 15), src=(
                      UOp(Ops.RESHAPE, dtypes.float, arg=(1, 15), src=(
                        UOp(Ops.PAD, dtypes.float, arg=((7, 0),), src=(
                          UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
                            UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
                              UOp(Ops.CONST, dtypes.float, arg=0.2, src=(
                                x15:=UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(), strides=(), offset=0, mask=None, contiguous=True),)), src=(
                                  UOp(Ops.DEVICE, dtypes.void, arg='CPU', src=()),)),)),)),)),)),)),)),)),)),)),)),)),)),)),)),)),
  UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
    UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
      UOp(Ops.CONST, dtypes.float, arg=0.3, src=(
         x15,)),)),)),))

Ohh, what the hell? I just want 8 numbers, what’s this insanity?

When something is not obvious, you can usually just go step by step.

                          UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
                            UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
                              UOp(Ops.CONST, dtypes.float, arg=0.2, src=(

First the scalar is reshaped to (1,) and expanded to 8 copies. 0.2 is the step - the other constant we'll meet later, 0.3, is the start element (0.5) minus the step (0.2).

Next we pad it with 7 (n-1) zero elements on the left, giving 15 elements, and expand vertically to 9 (n+1) rows:

                    UOp(Ops.EXPAND, dtypes.float, arg=(9, 15), src=(
                      UOp(Ops.RESHAPE, dtypes.float, arg=(1, 15), src=(
                        UOp(Ops.PAD, dtypes.float, arg=((7, 0),), src=(

Next we flatten it into a 135-element array (9 × 15) and cut off the last 7 elements, so it ends in [..., 0, 0, 0, 0.2]:

Then we reshape it back to a rectangle, now (8, 16). Since each 16-element row is offset by one against the 15-element repeating pattern, the block of 0.2s shifts left by one column per row - an interesting pattern.

              UOp(Ops.RESHAPE, dtypes.float, arg=(8, 16), src=(
                UOp(Ops.SHRINK, dtypes.float, arg=((0, 128),), src=(
                  UOp(Ops.RESHAPE, dtypes.float, arg=(135,), src=(
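
That pattern is easy to visualize in NumPy (my own reconstruction of the pad, tile, shrink and reshape steps - not tinygrad code):

```python
import numpy as np

# the 15-element repeating row: 7 zeros, then 8 steps
row = np.pad(np.full(8, 0.2, dtype=np.float32), (7, 0))

# tile 9 times (135 elements), drop the last 7, fold into (8, 16)
grid = np.tile(row, 9)[:128].reshape(8, 16)

# print as 0/1 to see the staircase: the nonzero block
# shifts one column to the left on every row
print((grid != 0).astype(int))
```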

Next we take just the left half of the pattern, and apply 3 transformations that don't actually do anything here.

We will explore the source of those transforms later.

      UOp(Ops.PERMUTE, dtypes.float, arg=(1, 0), src=(
        UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8), src=(
          UOp(Ops.RESHAPE, dtypes.float, arg=(8, 8, 1), src=(
            UOp(Ops.SHRINK, dtypes.float, arg=((0, 8), (0, 8)), src=(

This was a long way to get a triangular matrix (sad Linear Algebra noises).

But now that we have it, let’s sum over the elements in one axis (axis 1 in this case):

  UOp(Ops.RESHAPE, dtypes.float, arg=(8,), src=(
    UOp(Ops.REDUCE_AXIS, dtypes.float, arg=(Ops.ADD, (1,)), src=(

I think you see where this is going. Let’s cover the second branch of the top ADD:

  UOp(Ops.EXPAND, dtypes.float, arg=(8,), src=(
    UOp(Ops.RESHAPE, dtypes.float, arg=(1,), src=(
      UOp(Ops.CONST, dtypes.float, arg=0.3, src=(

And the final step - we add the two branches to get our numbers!

UOp(Ops.ADD, dtypes.float, arg=None, src=(
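
To convince myself the whole graph really computes an arange, here is the entire chain re-enacted in plain NumPy (my own sketch - the op names in the comments refer to the UOps above):

```python
import numpy as np

n, start, step = 8, 0.5, 0.2

# CONST 0.2 -> RESHAPE (1,) -> EXPAND (8,)
x = np.full((n,), step, dtype=np.float32)

# PAD ((7, 0),): 7 zeros on the left, 15 elements total
x = np.pad(x, (n - 1, 0))

# RESHAPE (1, 15) -> EXPAND (9, 15) -> RESHAPE (135,)
x = np.broadcast_to(x, (n + 1, 2 * n - 1)).ravel()

# SHRINK ((0, 128),) -> RESHAPE (8, 16)
x = x[:n * 2 * n].reshape(n, 2 * n)

# SHRINK ((0, 8), (0, 8)): left half, a triangular matrix.
# The two RESHAPEs are no-ops; PERMUTE (1, 0) is a transpose.
x = x[:, :n].T

# REDUCE_AXIS (ADD, (1,)): row sums, i.e. the cumulative sum of the steps
partial = x.sum(axis=1)        # approximately [0.2, 0.4, ..., 1.6]

# the top-level ADD with (start - step) == 0.3
result = partial + (start - step)
```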

Let’s have a look at the code I guess.

a.numpy()
opened device CPU from pid:154318
r_8_8
 0: (8, 1)                    float.ptr(8)         (1, 0)                         ShapeTracker(views=(View(shape=(8, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),))
[Opt(op=OptOps.UNROLL, axis=0, arg=0)]

void r_8_8(float* restrict data0) {
  for (int ridx0 = 0; ridx0 < 8; ridx0++) {
    *(data0+ridx0) = ((((ridx0<7)!=1)?0.2f:0.0f)+(((ridx0<6)!=1)?0.2f:0.0f)+(((ridx0<5)!=1)?0.2f:0.0f)+(((ridx0<4)!=1)?0.2f:0.0f)+(((ridx0<3)!=1)?0.2f:0.0f)+(((ridx0<2)!=1)?0.2f:0.0f)+(((ridx0<1)!=1)?0.2f:0.0f)+0.5f);
  }
}

*** CPU        1 r_8_8                                     arg  1 mem  0.00 GB tm      2.79us/     0.00ms (     0.08 GFLOPS    0.0|0.0     GB/s) ['numpy', 'arange']
array([0.5      , 0.7      , 0.9      , 1.1      , 1.3      , 1.5      ,
       1.7      , 1.9000001], dtype=float32)

Not gonna lie, this is an ugly way to do it. I suspect this could be improved.

void r_8_8(float* restrict data0) {
  for (int ridx0 = 0; ridx0 < 8; ridx0++) {
    *(data0 + ridx0) = ((((ridx0 < 7) != 1) ? 0.2f : 0.0f) +
                        (((ridx0 < 6) != 1) ? 0.2f : 0.0f) +
                        (((ridx0 < 5) != 1) ? 0.2f : 0.0f) +
                        (((ridx0 < 4) != 1) ? 0.2f : 0.0f) +
                        (((ridx0 < 3) != 1) ? 0.2f : 0.0f) +
                        (((ridx0 < 2) != 1) ? 0.2f : 0.0f) +
                        (((ridx0 < 1) != 1) ? 0.2f : 0.0f) + 0.5f);
  }
}
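
Transcribing the kernel into Python makes it clearer what it computes: each of the 7 ternaries contributes one extra step once the index passes its threshold, and it looks like the optimizer folded (start - step) plus the one unconditional step into the 0.5f constant (my transcription, hypothetical variable names):

```python
step, start, n = 0.2, 0.5, 8

out = []
for ridx0 in range(n):
    # ((ridx0 < t) != 1) ? 0.2f : 0.0f -> contributes one step once ridx0 >= t
    acc = sum(step for t in range(1, n) if not ridx0 < t)
    out.append(acc + start)    # the 0.5f at the end of the kernel
```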

This must be some optimization misbehaving. With NOOPT=1 we get good code:

void r_8_8(float* restrict data0) {
  for (int ridx0 = 0; ridx0 < 8; ridx0++) {
    *(data0+ridx0) = ((((float)((ridx0+1)))*0.2f)+0.3f);
  }
}
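
This is just the closed form start + i*step, written as (i+1)*step + (start - step). A quick sanity check in plain Python:

```python
start, step, n = 0.5, 0.2, 8

# (ridx0 + 1) * step + (start - step) == start + ridx0 * step
values = [(i + 1) * step + (start - step) for i in range(n)]
```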

At least TinyGrad was able to optimize out all the insane transformations!

Let’s take a look at the code:

tensor.py (simplified)

  def arange(start, stop=None, step=1, **kwargs) -> Tensor:
    dtype = ...              # Figure out the output dtype
    output_len = ceildiv(stop - start, step)
    return (Tensor.full((output_len,), step, dtype=dtype, **kwargs)._cumalu(0, Ops.ADD) + (start - step)).cast(dtype)
  • Tensor.full((output_len,), step, dtype=dtype, **kwargs) creates the (8,) vector filled with 0.2 (the step)
  • (start - step) is the 0.3 constant - the second branch of the top ADD.
  • _cumalu(0, Ops.ADD) is where all the magic happens.
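
So arange is expressed as a cumulative sum of a constant vector. The same decomposition in NumPy (my sketch, using np.cumsum in place of _cumalu):

```python
import math
import numpy as np

start, stop, step = 0.5, 2.0, 0.2
output_len = math.ceil((stop - start) / step)   # ceildiv, gives 8

# Tensor.full((output_len,), step): the (8,) vector of 0.2s
full = np.full((output_len,), step, dtype=np.float32)

# _cumalu(0, Ops.ADD) is a cumulative sum along the axis
cum = np.cumsum(full)                           # approximately [0.2, 0.4, ..., 1.6]

# + (start - step) shifts everything into place
result = cum + (start - step)
```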
  def _cumalu(self, axis:int, op:Ops, _include_initial=False) -> Tensor:
    pl_sz = self.shape[axis] - int(not _include_initial)
    pooled = self.transpose(axis,-1).pad((pl_sz, -int(_include_initial)), value=identity_element(op, self.dtype))._pool((self.shape[axis],))

    # For ADD:
    return pooled.sum(-1).transpose(axis, -1)
  • pl_sz will be 7
  • self.transpose(axis,-1) moves the axis we are interested in to the last position. In our case we only have 1 axis, so this does nothing, and gets optimized out immediately.
  • pad((pl_sz, -int(_include_initial))) - here is the pad that adds the 7 elements on the left, filled with zeros (the identity element for ADD).
  • _pool((self.shape[axis],)): Here be dragons
  def _pool(self, k_:tuple[sint, ...], stride:int|tuple[int, ...]=1, dilation:int|tuple[int, ...]=1) -> Tensor:
    assert len(self.shape) >= len(k_), f"can't pool {self.shape} with {k_}"
    s_, d_ = make_tuple(stride, len(k_)), make_tuple(dilation, len(k_))
    assert len(k_) == len(s_) == len(d_), f"stride/dilation mismatch kernel:{k_} stride:{s_} dilation:{d_}"
    noop, i_ = [None] * (self.ndim-len(k_)), self.shape[-len(k_):]
    assert all(resolve(d*(k-1)+1 <= i) for k,d,i in zip(k_,d_,i_)), "kernel size cannot be greater than actual input size"
    o_ = [ceildiv(i-d*(k-1), s) for i,d,k,s in zip(i_,d_,k_,s_)]
    if any(resolve(k > s) for k,s in zip(k_,s_)) or any(d != 1 for d in d_):
      # input size scaling factor to make sure shrink for stride is possible
      f_ = [1 + int(resolve(o*s > (i - d*(k-1)))) for o,s,i,d,k in zip(o_,s_,i_,d_,k_)]
      # repeats such that we don't need padding
      x = self.repeat([1]*len(noop) + [ceildiv(k*(i*f+d),i) for k,i,d,f in zip(k_,i_,d_,f_)])
      # handle dilation
      x = x.shrink(tuple(noop + [(0,k*(i*f+d)) for k,i,d,f in zip(k_,i_,d_,f_)])).reshape(noop + flatten((k,(i*f+d)) for k,i,d,f in zip(k_,i_,d_,f_)))
      # handle stride
      x = x.shrink(tuple(noop + flatten(((0,k), (0,o*s)) for k,o,s in zip(k_,o_,s_)))).reshape(noop + flatten((k,o,s) for k,o,s in zip(k_,o_,s_)))
      x = x.shrink(tuple(noop + flatten(((0,k), (0,o), (0,1)) for k,o in zip(k_,o_)))).reshape(noop + flatten((k,o) for k,o in zip(k_,o_)))
      # permute to move reduce to the end
      return x.permute(*range(len(noop)), *[len(noop)+i*2+1 for i in range(len(i_))], *[len(noop)+i*2 for i in range(len(i_))])
    # TODO: once the shapetracker can optimize well, remove this alternative implementation
    x = self.pad(tuple(noop + [(0, max(0,o*s-i)) for i,o,s in zip(i_,o_,s_)])).shrink(tuple(noop + [(0,o*s) for o,s in zip(o_,s_)]))
    x = x.reshape(noop + flatten(((o,s) for o,s in zip(o_,s_))))
    x = x.shrink(tuple(noop + flatten(((0,o), (0,k)) for o,k in zip(o_,k_))))
    return x.permute(*range(len(noop)), *[len(noop)+i*2 for i in range(len(i_))], *[len(noop)+i*2+1 for i in range(len(i_))])

Lol I’m not going to pretend I can follow what’s going on here. This obviously does the crazy triangle.
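
My best reading: for this 1-D case, _pool builds a sliding window over the padded vector - window i holds i+1 real elements plus leading zeros, so summing each window gives the prefix sums. Here's the idea with NumPy's stride tricks (my sketch; tinygrad gets there with the reshape/shrink dance above instead):

```python
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

n, step = 8, 0.2
x = np.full((n,), step, dtype=np.float32)

# pad with n-1 identity elements (zeros for ADD) on the left
padded = np.pad(x, (n - 1, 0))            # 15 elements

# _pool((n,)) with stride 1: n overlapping windows of size n
windows = sliding_window_view(padded, n)  # shape (8, 8)

# window i holds n-1-i zeros followed by i+1 steps,
# so summing each window yields the cumulative sum
cumsum = windows.sum(axis=1)
```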

  • pooled.sum(-1).transpose(axis, -1) This must be that REDUCE_AXIS operation, and another transpose that does nothing in this case.

Sometimes I wish TinyGrad was easier to understand. :)