from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# WordPiece model with an explicit unknown-token symbol
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# Deliberately tiny vocabulary: the trainer targets 1024 tokens in total
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[SEP]", "[PAD]"], vocab_size=1024)

# Split on whitespace and punctuation before the WordPiece merges are applied
tokenizer.pre_tokenizer = Whitespace()

from pathlib import Path

TS_PATH = Path("datasets/TinyStories/")
# Training is a one-off step; uncomment to retrain from the raw corpus.
# tokenizer.train([str(TS_PATH / "TinyStories.txt")], trainer)


# tokenizer.save(str(TS_PATH / "wordpiece_1024.json"))
# Load the tokenizer that was trained and saved earlier
tokenizer = Tokenizer.from_file(str(TS_PATH / "wordpiece_1024.json"))
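
# (Illustrative sanity check, not in the original: get_vocab_size() and
# token_to_id() are standard Tokenizer methods and can confirm what was loaded.
# The trainer targeted vocab_size=1024, so roughly that size is expected.)
print(tokenizer.get_vocab_size())
print(tokenizer.token_to_id("[PAD]"))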
token_ids = tokenizer.encode("Hello, y'all! How are you?").ids
print(token_ids)
# [247, 988, 14, 90, 9, 346, 3, 42, 235, 430, 264, 33]

print(tokenizer.decode(token_ids))
# He ##llo , y ' all ! H ##ow are you ?
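
# (Illustrative, not in the original: the Encoding object also exposes the
# subword strings via .tokens, which shows the WordPiece split without decoding.)
print(tokenizer.encode("Hello, y'all! How are you?").tokens)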
with open("./datasets/TinyStories/TinyStories_1percent.txt") as f:
    text = f.read()
tokenized_text = tokenizer.encode(text).ids
print(tokenized_text[:10])
# [227, 193, 442, 430, 324, 16, 250, 449, 191, 242]
import numpy as np

# vocab_size is 1024, so every id fits comfortably in int16
tokenized_text_np = np.array(tokenized_text).astype(np.int16)
# np.save appends the ".npy" extension automatically
np.save("./datasets/TinyStories/TinyStories_1percent_ids", tokenized_text_np)
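
# (Hedged sketch, not in the original: since np.save added a ".npy" extension,
# the ids can be reloaded later and spot-checked against the tokenizer.)
loaded_ids = np.load("./datasets/TinyStories/TinyStories_1percent_ids.npy")
print(tokenizer.decode(loaded_ids[:20].tolist()))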