from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[SEP]", "[PAD]"], vocab_size=1024)
tokenizer.pre_tokenizer = Whitespace()
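
# Quick illustration (a sketch, not in the original): the Whitespace
# pre-tokenizer splits on whitespace and peels punctuation off into
# separate pieces before the WordPiece model runs.
print(tokenizer.pre_tokenizer.pre_tokenize_str("Hello, y'all!"))
# [('Hello', (0, 5)), (',', (5, 6)), ('y', (7, 8)), ("'", (8, 9)), ('all', (9, 12)), ('!', (12, 13))]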
from pathlib import Path
= Path("datasets/TinyStories/") TS_PATH
# res = tokenizer.train([ str(TS_PATH / "TinyStories.txt") ], trainer)
# tokenizer.save(str(TS_PATH / "wordpiece_1024.json"))
tokenizer = Tokenizer.from_file(str(TS_PATH / "wordpiece_1024.json"))
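
# Sanity check (sketch): the reloaded vocabulary should be capped at the
# vocab_size requested from the trainer above.
print(tokenizer.get_vocab_size())  # expected: 1024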
= tokenizer.encode("Hello, y'all! How are you?").ids
token_ids print(token_ids)
print(tokenizer.decode(token_ids))
[247, 988, 14, 90, 9, 346, 3, 42, 235, 430, 264, 33]
He ##llo , y ' all ! H ##ow are you ?
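
# The subword pieces can also be read directly off the Encoding object
# via .tokens, without a decode round trip (sketch):
enc = tokenizer.encode("Hello, y'all! How are you?")
print(enc.tokens)
# ['He', '##llo', ',', 'y', "'", 'all', '!', 'H', '##ow', 'are', 'you', '?']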
with open("./datasets/TinyStories/TinyStories_1percent.txt") as f:
    text = f.read()

tokenized_text = tokenizer.encode(text).ids
tokenized_text[:10]
[227, 193, 442, 430, 324, 16, 250, 449, 191, 242]
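
# Rough fertility check (sketch): tokens per character of raw text. A
# 1024-entry vocabulary fragments words heavily, so expect a higher
# ratio than a full-size (e.g. ~30k) vocabulary would give.
print(len(tokenized_text) / len(text))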
import numpy as np

# With vocab_size=1024, every id fits comfortably in int16 (max 32767),
# shrinking the file to a quarter of the default int64 size.
tokenized_text_np = np.array(tokenized_text).astype(np.int16)
np.save("./datasets/TinyStories/TinyStories_1percent_ids", tokenized_text_np)