import re

# Patterns that mark a line as metadata rather than text. They are compiled
# once at import time instead of being re-resolved for every input line, and
# are applied with ``match`` semantics (anchored at the start of the line).
_METADATA_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # 8 lowercase-hex chars, '-', 32 lowercase-hex chars, then '.txt'
        r"^[0-9a-f]{8}-[0-9a-f]{32}\.txt",
        # Lines consisting only of ASCII digits
        r"^[0-9]+$",
        # Lines starting with at least 7 digits (the trailing ``.*`` absorbs
        # any further digits, so longer runs match as well)
        r"^[0-9]{7,8}.*$",
        # Lines without any ASCII letter; this also covers empty lines
        r"^[^a-zA-Z]*$",
        # Lines containing 'ustar' anywhere
        r"^.*ustar.*$",
    )
)


def clean_data(data: str) -> str:
    """Return *data* with every metadata line removed.

    The input is split with ``str.splitlines()``; a line is dropped when any
    of the module's metadata patterns matches it. The surviving lines are
    joined with ``"\\n"``, so original line terminators are normalised and
    no trailing newline is emitted.

    Args:
        data: Raw text, possibly interleaved with metadata lines.

    Returns:
        The retained lines joined by ``"\\n"`` (an empty string when nothing
        is retained).
    """
    kept_lines = [
        line
        for line in data.splitlines()
        if not any(pattern.match(line) for pattern in _METADATA_PATTERNS)
    ]
    return "\n".join(kept_lines)


# The file processing below only runs when this file is executed as a script,
# so importing ``clean_data`` does not require the split files to exist. The
# statements stay at module scope (not inside a function) so the resulting
# names remain globals, exactly as in a plain top-level script.
if __name__ == "__main__":
    # Load and clean training data
    with open("train_split.txt", "r", encoding="utf-8") as f:
        train_data = f.read()
    train_data_cleaned = clean_data(train_data)

    # Load and clean validation data
    with open("eval_split.txt", "r", encoding="utf-8") as f:
        val_data = f.read()
    val_data_cleaned = clean_data(val_data)

    # Save cleaned data for inspection (optional)
    with open("train_split_cleaned.txt", "w", encoding="utf-8") as f:
        f.write(train_data_cleaned)
    with open("eval_split_cleaned.txt", "w", encoding="utf-8") as f:
        f.write(val_data_cleaned)

    # Print the first 1000 characters of each cleaned split
    print("Sample cleaned training data:", train_data_cleaned[:1000])
    print("Sample cleaned validation data:", val_data_cleaned[:1000])