Feat: Added a clean_data script to process the data better
Feat: Added the new cleaned datasets
This commit is contained in:
48
clean_data.py
Normal file
48
clean_data.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def clean_data(data):
    """Remove metadata-looking lines from *data* and return the remaining text.

    The input is split with ``str.splitlines()`` and every line that matches
    one of the metadata patterns below is dropped.  The surviving lines are
    joined back together with ``"\\n"`` (so any trailing newline of the input
    is not preserved).

    Note that the "no alphabetic characters" rule only knows ASCII letters, so
    it also drops blank lines and lines made up solely of non-ASCII text.

    Args:
        data: The raw text to clean.

    Returns:
        The cleaned text as a single string; ``""`` when nothing survives.
    """
    # Compile once per call so the per-line loop only performs matching.
    metadata_patterns = [
        re.compile(pattern)
        for pattern in (
            # Lines that start with a hex-named ``.txt`` file name.
            r"^[0-9a-f]{8}-[0-9a-f]{32}\.txt",
            # Lines consisting only of digits.
            r"^[0-9]+$",
            # Lines that start with a run of at least 7 digits.
            r"^[0-9]{7,8}.*$",
            # Lines without any ASCII alphabetic character (includes blanks).
            r"^[^a-zA-Z]*$",
            # Lines containing the 'ustar' marker.  The marker must not be
            # flanked by ASCII letters; otherwise ordinary words such as
            # "custard" or "mustard" would cause real text to be discarded.
            r"^.*(?<![a-zA-Z])ustar(?![a-zA-Z])",
        )
    ]

    clean_lines = [
        line
        for line in data.splitlines()
        if not any(pattern.match(line) for pattern in metadata_patterns)
    ]
    return "\n".join(clean_lines)
|
||||||
|
|
||||||
|
|
||||||
|
# Training split: read the raw text, then strip the metadata lines.
with open("train_split.txt", "r", encoding="utf-8") as f:
    train_data = f.read()
train_data_cleaned = clean_data(train_data)

# Validation split: same treatment as the training split.
with open("eval_split.txt", "r", encoding="utf-8") as f:
    val_data = f.read()
val_data_cleaned = clean_data(val_data)

# Persist both cleaned splits so they can be inspected by hand (optional).
with open("train_split_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(train_data_cleaned)

with open("eval_split_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(val_data_cleaned)

# Show the first 1000 characters of each cleaned split as a quick check.
print("Sample cleaned training data:", train_data_cleaned[:1000])
print("Sample cleaned validation data:", val_data_cleaned[:1000])
|
141562
eval_split_cleaned.txt
Normal file
141562
eval_split_cleaned.txt
Normal file
File diff suppressed because one or more lines are too long
1282730
train_split_cleaned.txt
Normal file
1282730
train_split_cleaned.txt
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user