# combine_and_clean.py
import concurrent.futures
import multiprocessing
import os
import random
import re

from tqdm import tqdm


def clean_data(data):
    """Drop lines that look like archive/metadata noise rather than prose."""
    metadata_patterns = [
        r"^[0-9a-f]{8}-[0-9a-f]{32}\.txt",  # shard-style filenames
        r"^[0-9]+$",                        # lines that are only digits
        r"^[0-9]{7,8}.*$",                  # lines starting with long numeric ids
        r"^[^a-zA-Z]*$",                    # lines with no letters at all
        r"^.*ustar.*$",                     # tar header residue
    ]
    clean_lines = []
    for line in data.splitlines():
        if any(re.match(pattern, line) for pattern in metadata_patterns):
            continue
        clean_lines.append(line)
    return "\n".join(clean_lines)


def process_file(args):
    """Read one file, append its text to the combined output, and return its character set."""
    directory, filename, output_file = args
    file_path = os.path.join(directory, filename)
    with open(file_path, "rt", encoding="utf-8") as infile:
        text = infile.read()
    # Each worker appends its whole file in a single write; this assumes the
    # appends from different processes do not interleave mid-file.
    with open(output_file, "a", encoding="utf-8") as outfile:
        outfile.write(text)
    return set(text)


def files_in_dir(directory):
    """List regular files (not subdirectories) directly inside a directory."""
    return [
        filename
        for filename in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, filename))
    ]


def process_files_in_parallel(files, output_file):
    """Process (directory, filename) pairs with a process pool and collect the vocabulary."""
    vocab = set()
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        args = [(directory, filename, output_file) for directory, filename in files]
        for characters in tqdm(executor.map(process_file, args), total=len(files)):
            vocab.update(characters)
    return vocab


def main():
    multiprocessing.freeze_support()  # needed for frozen executables on Windows

    dataset_dirs = ["datasets/openwebtext", "datasets/other_dataset"]
    output_file_train = "combined_train.txt"
    output_file_val = "combined_eval.txt"
    vocab_file = "vocab.txt"

    # Collect (directory, filename) pairs from every dataset directory so each
    # file keeps track of where it came from.
    all_files = []
    for directory in dataset_dirs:
        all_files.extend((directory, filename) for filename in files_in_dir(directory))

    # 90/10 train/validation split, taken in listing order.
    split_index = int(len(all_files) * 0.9)
    files_train = all_files[:split_index]
    files_val = all_files[split_index:]

    # Work on a 1% random sample of each split (at least one file per split).
    sample_rate = 0.01
    files_train_sampled = random.sample(
        files_train, max(1, int(len(files_train) * sample_rate))
    )
    files_val_sampled = random.sample(
        files_val, max(1, int(len(files_val) * sample_rate))
    )

    # Truncate the combined output files before the workers start appending.
    open(output_file_train, "w", encoding="utf-8").close()
    open(output_file_val, "w", encoding="utf-8").close()

    vocab_train = process_files_in_parallel(files_train_sampled, output_file_train)
    vocab_val = process_files_in_parallel(files_val_sampled, output_file_val)

    # Write the combined character vocabulary, one character per line.
    vocab = vocab_train.union(vocab_val)
    with open(vocab_file, "w", encoding="utf-8") as vfile:
        for char in sorted(vocab):
            vfile.write(char + "\n")

    # Clean the combined files and write the cleaned copies.
    with open(output_file_train, "r", encoding="utf-8") as f:
        train_data_cleaned = clean_data(f.read())
    with open("combined_train_cleaned.txt", "w", encoding="utf-8") as f:
        f.write(train_data_cleaned)

    with open(output_file_val, "r", encoding="utf-8") as f:
        val_data_cleaned = clean_data(f.read())
    with open("combined_eval_cleaned.txt", "w", encoding="utf-8") as f:
        f.write(val_data_cleaned)


if __name__ == "__main__":
    main()