Feat: Added a clean_data to process the data better
Feat: Added the new cleaned datasets
This commit is contained in:
48
clean_data.py
Normal file
48
clean_data.py
Normal file
@ -0,0 +1,48 @@
|
||||
import re
|
||||
|
||||
|
||||
# Regex patterns that identify metadata lines to strip. Compiled once at
# module load instead of being rebuilt (and re-matched as raw strings) on
# every clean_data() call.
_METADATA_PATTERNS = tuple(
    re.compile(p)
    for p in (
        # Lines that start with hash-style .txt file names
        r"^[0-9a-f]{8}-[0-9a-f]{32}\.txt",
        # Lines consisting only of digits
        r"^[0-9]+$",
        # Lines starting with 7- or 8-digit numbers
        r"^[0-9]{7,8}.*$",
        # Lines without any alphabetic characters
        r"^[^a-zA-Z]*$",
        # Lines containing 'ustar' (tar-archive residue)
        r"^.*ustar.*$",
    )
)


def clean_data(data):
    """Strip metadata lines from *data* and return the cleaned text.

    Splits *data* into lines, drops every line whose start matches one of
    the metadata patterns above, and rejoins the survivors with newlines.

    Args:
        data: Raw text, possibly containing metadata lines.

    Returns:
        The text with all metadata lines removed, joined by "\\n".
        An empty string if *data* is empty or all lines are metadata.
    """
    return "\n".join(
        line
        for line in data.splitlines()
        if not any(pattern.match(line) for pattern in _METADATA_PATTERNS)
    )
|
||||
|
||||
|
||||
def _read_text(path):
    """Read and return the full contents of the UTF-8 text file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def _write_text(path, text):
    """Write *text* to the UTF-8 text file at *path*, overwriting it."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


# Load and clean training data
train_data = _read_text("train_split.txt")
train_data_cleaned = clean_data(train_data)

# Load and clean validation data
val_data = _read_text("eval_split.txt")
val_data_cleaned = clean_data(val_data)

# Save cleaned data for inspection (optional)
_write_text("train_split_cleaned.txt", train_data_cleaned)
_write_text("eval_split_cleaned.txt", val_data_cleaned)

# Print sample cleaned data
print("Sample cleaned training data:", train_data_cleaned[:1000])
print("Sample cleaned validation data:", val_data_cleaned[:1000])
|
Reference in New Issue
Block a user