# Download and extract the TinyStories dataset
# !wget -c https://huggingface.co/datasets/roneneldan/TinyStories/raw/main/TinyStories_all_data.tar.gz -O datasets/TinyStories/TinyStories_all_data.tar.gz
# !cd datasets/TinyStories && tar -xvf TinyStories_all_data.tar.gz && cd ../..
TinyStories dataset pre-processing.
from tqdm import tqdm
import json
import os
from pathlib import Path
import unidecode
= Path("datasets/TinyStories/")
TS_PATH
= []
stories
for file in tqdm(list(sorted(os.listdir(TS_PATH)))):
if file.endswith(".json"):
with open(TS_PATH / file, "r") as f:
= json.load(f)
data for d in data:
= d["story"]
story if not all(ord(c) < 128 for c in story):
= unidecode.unidecode(story)
story
stories.append(story)
# if d["source"] == "GPT-3.5":
# gpt35_stories.append(story)
# elif d["source"] == "GPT-4":
# gpt4_stories.append(story)
# with open("gpt35_stories.txt", "w") as f:
# f.write("\n".join(gpt35_stories))
# with open("gpt4_stories.txt", "w") as f:
# f.write("\n".join(gpt4_stories))
100%|██████████| 51/51 [03:08<00:00, 3.69s/it]
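For reference, the commented-out branches above would split the corpus by generating model. A minimal self-contained sketch of that variant, not run here; it assumes the "source" field takes the values "GPT-3.5" and "GPT-4" as the commented code suggests:

import json
import os
from pathlib import Path
import unidecode
from tqdm import tqdm

TS_PATH = Path("datasets/TinyStories/")
gpt35_stories, gpt4_stories = [], []
for file in tqdm(sorted(os.listdir(TS_PATH))):
    if not file.endswith(".json"):
        continue
    with open(TS_PATH / file, "r") as f:
        data = json.load(f)
    for d in data:
        story = d["story"]
        if not all(ord(c) < 128 for c in story):
            story = unidecode.unidecode(story)
        # Bucket each story by the model that generated it
        if d["source"] == "GPT-3.5":
            gpt35_stories.append(story)
        elif d["source"] == "GPT-4":
            gpt4_stories.append(story)

with open("gpt35_stories.txt", "w") as f:
    f.write("\n".join(gpt35_stories))
with open("gpt4_stories.txt", "w") as f:
    f.write("\n".join(gpt4_stories))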
with open(TS_PATH / "TinyStories.txt", "w") as f:
"\n".join(stories)) f.write(
'\t\n !"$%&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]`abcdefghijklmnopqrstuvwxyz|~'