from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
# WordPiece model with a small 1024-token vocabulary; [UNK] covers anything
# outside the learned vocab, and the Whitespace pre-tokenizer splits the raw
# text before the model sees it.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[SEP]", "[PAD]"], vocab_size=1024)
tokenizer.pre_tokenizer = Whitespace()
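# Quick peek at the pre-tokenization step (a sketch using the library's
# pre_tokenize_str helper; the exact output below is illustrative): the
# Whitespace pre-tokenizer splits on whitespace and punctuation and returns
# (piece, byte_offsets) pairs.
print(tokenizer.pre_tokenizer.pre_tokenize_str("Hello, y'all!"))
# [('Hello', (0, 5)), (',', (5, 6)), ('y', (7, 8)), ("'", (8, 9)), ('all', (9, 12)), ('!', (12, 13))]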
from pathlib import Path
TS_PATH = Path("datasets/TinyStories/")

# Training was run once and the result saved; we reload it from disk below.
# res = tokenizer.train([str(TS_PATH / "TinyStories.txt")], trainer)
# tokenizer.save(str(TS_PATH / "wordpiece_1024.json"))

tokenizer = Tokenizer.from_file(str(TS_PATH / "wordpiece_1024.json"))
token_ids = tokenizer.encode("Hello, y'all! How are you?").ids
print(token_ids)
print(tokenizer.decode(token_ids))
[247, 988, 14, 90, 9, 346, 3, 42, 235, 430, 264, 33]
He ##llo , y ' all ! H ##ow are you ?
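# The raw decode above keeps WordPiece's "##" continuation markers. An
# optional follow-up (a sketch; decoders.WordPiece ships with the tokenizers
# library): attach a decoder so decode() merges subword pieces back into
# whole words.
from tokenizers import decoders

tokenizer.decoder = decoders.WordPiece(prefix="##", cleanup=True)
print(tokenizer.decode(token_ids))  # "He ##llo" and "H ##ow" are merged back into "Hello" / "How"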
with open("./datasets/TinyStories/TinyStories_1percent.txt") as f:
    text = f.read()

tokenized_text = tokenizer.encode(text).ids
tokenized_text[:10]
[227, 193, 442, 430, 324, 16, 250, 449, 191, 242]
import numpy as np

# With a 1024-token vocab, every id fits easily in int16, keeping the saved
# array small.
tokenized_text_np = np.array(tokenized_text).astype(np.int16)
np.save("./datasets/TinyStories/TinyStories_1percent_ids", tokenized_text_np)
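# Sanity check on the dtype and the round trip (a sketch; the ".npy" suffix
# is appended by np.save): with vocab_size=1024 every id is far below the
# int16 ceiling of 32767, and the reloaded array should match exactly.
assert tokenized_text_np.max() < 2**15, "ids would overflow int16"
reloaded = np.load("./datasets/TinyStories/TinyStories_1percent_ids.npy")
assert np.array_equal(reloaded, tokenized_text_np)
print(tokenizer.decode(reloaded[:20].tolist()))  # spot-check: ids decode back to story text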