import tidygrad as tg
from tidygrad import Tensor
import tidygrad.tensor
import numpy as np
import huggingface_hub
import datasets
GPT2-Nano training
# ds = datasets.load_dataset("roneneldan/TinyStories")
n_vocab = 1024
n_layers = 2
n_heads = 4
ndim = 128
ctx_len = 32
def gpt2_new(n_vocab, n_layers, n_heads, ndim):
    shape_dict = {
        "wte": [n_vocab, ndim],
        "wpe": [ctx_len, ndim],
        "ln_f.weight": [ndim],
        "ln_f.bias": [ndim],
    }

    for i in range(n_layers):
        shape_dict[f"h.{i}.ln_1.weight"] = [ndim]
        shape_dict[f"h.{i}.ln_1.bias"] = [ndim]

        shape_dict[f"h.{i}.attn.c_attn.weight"] = [ndim, 3 * ndim]
        shape_dict[f"h.{i}.attn.c_attn.bias"] = [3 * ndim]

        shape_dict[f"h.{i}.attn.c_proj.weight"] = [ndim, ndim]
        shape_dict[f"h.{i}.attn.c_proj.bias"] = [ndim]

        shape_dict[f"h.{i}.ln_2.weight"] = [ndim]
        shape_dict[f"h.{i}.ln_2.bias"] = [ndim]

        shape_dict[f"h.{i}.mlp.c_fc.weight"] = [ndim, 4 * ndim]
        shape_dict[f"h.{i}.mlp.c_fc.bias"] = [4 * ndim]

        shape_dict[f"h.{i}.mlp.c_proj.weight"] = [4 * ndim, ndim]
        shape_dict[f"h.{i}.mlp.c_proj.bias"] = [ndim]

    return tg.model.Model(shape_dict)

model = gpt2_new(n_vocab=n_vocab, n_layers=n_layers, n_heads=n_heads, ndim=ndim)
t = Tensor(123, requires_grad=False)
t1 = t + t

t1.requires_grad is False and t1.parents is [].

t1.requires_grad(True)
t1.requires_grad is now True, but t1 still has no parents!
t1.op should be Load, not Add.
def gpt2_init(model):
    for k in model.params.keys():
        if k.endswith(".weight"):
            model.params[k] = Tensor(np.random.randn(*model.params[k].shape), name=k) * 0.02
        elif k.endswith(".bias"):
            model.params[k] = Tensor(np.zeros(model.params[k].shape), name=k)

    model.params["wte"] = Tensor(np.random.randn(*model.params["wte"].shape), name="wte") * 0.02
    model.params["wpe"] = Tensor(np.random.randn(*model.params["wpe"].shape), name="wpe") * 0.01

gpt2_init(model)
model.requires_grad(True)

model = tg.model.Model("model.safetensors")
tidygrad.tensor._num_tensors
596
import tidygrad.func as F
def gpt2_transformer_block(model: tg.model.Model, x, n_heads, i):
    def get_params(s):
        return model.params[f"h.{i}.{s}"]

    ln_1 = F.layer_norm(x, get_params("ln_1.weight"), get_params("ln_1.bias"))

    # Project to Q, K, V (c_attn holds all three, concatenated along the last axis)
    attn_w_qkv = get_params("attn.c_attn.weight")
    attn_b_qkv = get_params("attn.c_attn.bias")

    attn_w_q, attn_w_k, attn_w_v = attn_w_qkv.split(3, axis=-1)
    attn_b_q, attn_b_k, attn_b_v = attn_b_qkv.split(3, axis=-1)

    q = ln_1.mmul(attn_w_q) + attn_b_q
    k = ln_1.mmul(attn_w_k) + attn_b_k
    v = ln_1.mmul(attn_w_v) + attn_b_v

    # Split the last dimension into n_heads chunks and stack them along a new leading axis
    q_chunked = F.stack(q.split(n=n_heads, axis=-1), axis=0)
    k_chunked = F.stack(k.split(n=n_heads, axis=-1), axis=0)
    v_chunked = F.stack(v.split(n=n_heads, axis=-1), axis=0)

    dim = q_chunked.shape[-1]
    attention = q_chunked.mmul(k_chunked.transpose(-1, -2)) / np.sqrt(dim / n_heads)

    # Causal mask: keep only the lower triangle, then normalize the masked exponentials
    mask = np.tril(np.ones(attention.shape), k=0)
    ee = np.exp(attention) * mask

    softmaxed = ee / ee.sum(axis=-1, keepdims=True)

    attention_output = softmaxed.mmul(v_chunked)
    attention_chunks = attention_output.split(axis=0, n=n_heads)
    # print("attention_chunks", attention_chunks)

    attention_reshaped = F.concat(attention_chunks, axis=-1)
    attention_reshaped = attention_reshaped[0]
    # print("attention_reshaped", attention_reshaped)

    cproj_w = get_params("attn.c_proj.weight")
    cproj_b = get_params("attn.c_proj.bias")
    # attention_reshaped = Tensor(attention_reshaped_np)

    crosstalk = attention_reshaped.mmul(cproj_w) + cproj_b

    # Residual connection around attention
    after_residual = crosstalk + x
    # print("after_residual", after_residual)

    ln2_w = get_params("ln_2.weight")
    ln2_b = get_params("ln_2.bias")

    after_ln2 = F.layer_norm(after_residual, ln2_w, ln2_b)

    # MLP: up-projection, GELU, down-projection
    mlp_c_fc_w = get_params("mlp.c_fc.weight")
    mlp_c_fc_b = get_params("mlp.c_fc.bias")

    after_up = after_ln2.mmul(mlp_c_fc_w) + mlp_c_fc_b
    # print("after_up", after_up)

    after_up_a = F.gelu(after_up)
    # print("after_up_a", after_up_a)

    mlp_c_proj_w = get_params("mlp.c_proj.weight")
    mlp_c_proj_b = get_params("mlp.c_proj.bias")

    after_down = after_up_a.mmul(mlp_c_proj_w) + mlp_c_proj_b

    # Residual connection around the MLP
    output = after_down + after_residual
    return output
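A note on the mask above: instead of adding a large negative value to future positions before a softmax, the block multiplies the exponentials by a lower-triangular mask and then normalizes. A pure-numpy sketch of the same trick on a single head (illustrative only, not tidygrad code):

# Causal masking by zeroing exponentials, as in gpt2_transformer_block above (numpy-only sketch)
scores = np.random.randn(4, 4)                    # [seq, seq] attention scores for one head
tri = np.tril(np.ones_like(scores))               # 1s on and below the diagonal
ee_demo = np.exp(scores) * tri                    # masked (future) positions contribute 0
sm_demo = ee_demo / ee_demo.sum(axis=-1, keepdims=True)
assert np.allclose(sm_demo.sum(axis=-1), 1.0)     # each row is a valid distribution
assert np.allclose(np.triu(sm_demo, k=1), 0.0)    # position t never attends to positions > t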
def gpt2(model, input, n_layers, n_heads):
    def get_params(s):
        return model.params[s]

    input = np.array(input)

    token_embeddings = F.embedding(get_params("wte"), input)
    position_embeddings = F.embedding(get_params("wpe"), np.arange(input.shape[-1]))

    x = token_embeddings + position_embeddings
    # print("first embedding", x)

    for i in range(n_layers):
        # print("layer", i)
        x = gpt2_transformer_block(model=model, x=x, n_heads=n_heads, i=i)

    return F.layer_norm(x, w=get_params("ln_f.weight"), b=get_params("ln_f.bias"))
# res = gpt2(model, np.arange(256).reshape(2, -1), n_layers=n_layers, n_heads=n_heads)
# res.sum().backward()
# from tidygrad.training import one_hot_encode_batch
def one_hot_encode(batch, n_classes):
    batch_size, sequence_length = batch.shape
    one_hot = np.zeros((batch_size, sequence_length, n_classes))
    rows, cols = np.indices((batch_size, sequence_length))
    one_hot[rows, cols, batch] = 1
    return one_hot
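A quick sanity check of one_hot_encode on a toy batch (plain numpy, just to show the shape and where the ones land):

demo = one_hot_encode(np.array([[1, 0], [2, 1]]), n_classes=3)
assert demo.shape == (2, 2, 3)                     # [batch, seq, n_classes]
assert demo[0, 0, 1] == 1 and demo[1, 0, 2] == 1   # ones sit at the target token ids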
def language_modeling_loss(model, input, target, n_layers, n_heads):
    res = gpt2(model, input, n_layers, n_heads)
    # print("res", res)
    # print("wte", model.params["wte"])

    logits = res.mmul(model.params["wte"].transpose(-1, -2), name="logits")
    # print("logits", logits)

    loss = F.CrossEntropy_loss(logits, one_hot_encode(target, n_classes=n_vocab))
    return loss
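The output projection above reuses the token-embedding matrix wte (weight tying), so the logits come out with shape [batch, seq, n_vocab]. A numpy-only shape check under the hyperparameters defined earlier (illustrative, not part of the training code):

_res = np.zeros((2, ctx_len, ndim))   # stand-in for the transformer output
_wte = np.zeros((n_vocab, ndim))      # stand-in for the tied embedding matrix
assert (_res @ _wte.T).shape == (2, ctx_len, n_vocab)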
# loss = language_modeling_loss(
# model, input=np.random.randint(0, n_vocab, size=(2, ctx_len)), target=np.random.randint(0, n_vocab, size=(2, ctx_len)), n_layers=n_layers, n_heads=n_heads
# )
# print("loss", loss)
# np.seterr(all="raise")
# l = loss.sum()
# print(loss)
# l.backward()
# with open("datasets/TinyStories/TinyStories.txt", "r") as file:
# tokens = file.read()
# Dataset:
# dataset = ["Lilly gsdsgfsdfsd sf sfds"] <- you can't sample from this
# dataset = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15.....]
# ctx len = 5
# dataset[0] = [1,2,3,4,5]
# dataset[1] = [2,3,4,5,6]
# dataset[2] = [3,4,5,6,7]
# dataset[3] = [4,5,6,7,8]
from tidygrad.utils.datasets import Dataset, DataLoader
tokens = np.load("./datasets/TinyStories/TinyStories_1percent_ids.npy")
class TSDataset(Dataset):
def __init__(self, token_array, ctx_len):
self.token_array = token_array
self.ctx_len = ctx_len
def __len__(self):
return len(self.token_array) - self.ctx_len - 1
def __getitem__(self, i):
return self.token_array[i:i + self.ctx_len], self.token_array[i + 1:i + self.ctx_len + 1]
def collate_fn(self, batch):
# print("batch", batch) # [(x1, y1), (x2, y2), (x3, y3)]
return np.stack([x for x, y in batch]), np.stack([y for x, y in batch])
dataset = TSDataset(tokens, 2)
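A quick check of the sliding-window indexing described in the comment block above, on a toy token array (illustrative only):

toy = TSDataset(np.arange(10), ctx_len=3)
toy_x, toy_y = toy[0]
# toy_x -> [0, 1, 2], toy_y -> [1, 2, 3]: the target is the input shifted one token to the right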
import math
class TSDataLoader(DataLoader):
def __init__(self, dataset, batch_size, batch_tfms=None, ctx_len=ctx_len, fake_epoch_len=256, seed=1337):
super().__init__(dataset=dataset, batch_size=batch_size, batch_tfms=batch_tfms)
self.fake_epoch_len = fake_epoch_len
self.ctx_len = ctx_len
self.rng = np.random.default_rng(seed)
def __len__(self):
return min((len(self.dataset) // self.batch_size) // self.ctx_len, self.fake_epoch_len)
def __iter__(self):
self.i = 0
return self
def __next__(self):
if self.i >= min(len(self), self.fake_epoch_len):
raise StopIteration
        idxs = self.rng.integers(0, len(self.dataset), size=(self.batch_size,))

        batch = [self.dataset[i] for i in idxs]
        batch = self.dataset.collate_fn(batch)
self.i += 1
return batch
dataloader = TSDataLoader(dataset, batch_size=128)
from tidygrad.utils.data import DataLoaders
X, y = next(iter(dataloader))
print("X", X.shape)
print("y", y.shape)
X (128, 2)
y (128, 2)
from tidygrad.training import Learner
from tidygrad.optim import Adam
from functools import partial
import tidygrad.tensor
def loss_function(X, y):
    # X is the transformer output; project it back through wte (tied weights) to get logits
    # y = Tensor(y)
    logits = X.mmul(model.params["wte"].transpose(-1, -2), name="logits")
    # print("X", X)
    # print("y", y)
    # print("logits", logits)

    one_one_hot = one_hot_encode(y, n_vocab)

    loss = F.CrossEntropy_loss(logits, one_one_hot, reduction="sum")
    print("loss", loss)

    loss = loss.mean()
    print("post_epoch num tensors", tidygrad.tensor._num_tensors)
    return loss
from tidygrad.training import DictLoggerCallback, ProgressBarCallback, Loss
class OneBatchCallback:
def __init__(self):
self.i = 0
def post_loss(self, learner):
print("post_batch_backward", self.i)
if self.i == 1:
raise Exception("post_batch_backward")
self.i += 1
class MemleakCallback:
def __init__(self):
self.i = 0
print("init")
def post_epoch(self, learner):
print("post_epoch num tensors", tidygrad.tensor._num_tensors)
model_funct = partial(gpt2, n_layers=n_layers, n_heads=n_heads)
def model_funct(input):
return gpt2(model, input, n_layers=n_layers, n_heads=n_heads)
optim = Adam(lr=0.001, params=model.parameter_list())

ler = Learner(
    model=model_funct,
    dataloaders=DataLoaders(train=dataloader, test=dataloader),
    loss_func=loss_function,
    optimizer=optim,
    callbacks=[
        DictLoggerCallback(metrics=[Loss()]),
        ProgressBarCallback(metrics="loss", plot_train_skip_ylim=15, plot_smooth_training=5),
        MemleakCallback(),
    ],
)
init
# ler.fit(epochs=50)
# import json
# json.dump(ler.history, open("history.json", "w"), indent=2)
from pathlib import Path
from tqdm.auto import tqdm
TS_PATH = Path("./datasets/TinyStories/")
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file(str(TS_PATH / "wordpiece_1024.json"))
def gpt2_language_modeling(model, input, n_layers, n_heads, temperature=0):
    res = gpt2(model, input, n_layers, n_heads)

    last_position = res[:, -1, :]
    # print("wte", model.params["wte"])

    logits = last_position.mmul(model.params["wte"].transpose(-1, -2), name="logits")
    return logits, logits.data.argmax(axis=-1)
gpt2_language_modeling(model, [[1, 2, 3, 5]], n_layers=n_layers, n_heads=n_heads)
(Tensor[1, 1024](name="" op=Load):
v=array[1, 1024] f32 4Kb x∈[-12.651, 7.492] μ=-3.202 σ=3.977
,
array([16]))
text = "Once"
# text = "<|endoftext|>"

tokens = tokenizer.encode(text).ids  # returns a list of integers
print(tokens)

print("=== Generating ===")
print("Input: ", tokenizer.decode(tokens))

with tidygrad.no_grad():
    for i in tqdm(range(30)):
        logits, res = gpt2_language_modeling(model, [tokens], n_layers=n_layers, n_heads=n_heads)
        tokens.append(int(logits.data.argmax(axis=-1)))
        del logits, res
        # gc.collect()
        # print(tokens)
        print("Output:", tokenizer.decode(tokens))
[302]
=== Generating ===
Input: Once
Output: Once upon
Output: Once upon a
Output: Once upon a time
Output: Once upon a time ,
Output: Once upon a time , a
Output: Once upon a time , a time
Output: Once upon a time , a time ,
Output: Once upon a time , a time , a
Output: Once upon a time , a time , a time
Output: Once upon a time , a time , a time ,
Output: Once upon a time , a time , a time , there
Output: Once upon a time , a time , a time , there was
Output: Once upon a time , a time , a time , there was a
Output: Once upon a time , a time , a time , there was a little
Output: Once upon a time , a time , a time , there was a little girl
Output: Once upon a time , a time , a time , there was a little girl named
Output: Once upon a time , a time , a time , there was a little girl named Lily
Output: Once upon a time , a time , a time , there was a little girl named Lily .
Output: Once upon a time , a time , a time , there was a little girl named Lily . She
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named Lily
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named Lily .
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named Lily . She
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named Lily . She was
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named Lily . She was very
Output: Once upon a time , a time , a time , there was a little girl named Lily . She was a little girl named Lily . She was very happy
/tmp/ipykernel_625194/3847700028.py:11: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
tokens.append(int(logits.data.argmax(axis=-1)))