Files
Nora/data_prep.py

218 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
data_prep.py
1) Attempts to download Cornell Movie-Dialogs via ConvoKit (key: "movie-corpus").
- If ConvoKit/Unsloth fails, falls back to manual ZIP download/extraction.
2) Attempts to download PersonaChat via Hugging Face Datasets:
- First tries "persona_chat" (older key).
- If that fails, tries "conv_ai_2" (alias).
- Catches any exception to skip gracefully.
3) Writes each utterance to:
data/conversational/cornell_movie_dialogs.txt
data/conversational/persona_chat.txt
After running, you'll have:
    data/
    ├── books/            (your original Gutenberg .txt files)
    └── conversational/
        ├── cornell_movie_dialogs.txt
        └── persona_chat.txt
Then retrain or fine-tune Nora on data/books/ + data/conversational/.
"""
import os
import sys
import zipfile
import tempfile
import urllib.request
from pathlib import Path
# === 1) Optional dependency: ConvoKit (used for Cornell Movie-Dialogs) ===
# USE_CONVOKIT records whether the import succeeded so that callers can
# fall back to the manual ZIP download when it did not.
try:
    from convokit import Corpus, download as convokit_download
except ImportError:
    USE_CONVOKIT = False
else:
    USE_CONVOKIT = True

# === 2) Optional dependency: Hugging Face Datasets (used for PersonaChat) ===
# HAS_DATASETS records whether `load_dataset` is available at import time.
try:
    from datasets import load_dataset
except ImportError:
    HAS_DATASETS = False
else:
    HAS_DATASETS = True
# Official ZIP archive of the Cornell Movie-Dialogs corpus; used only as a
# fallback source for the download.
CORNELL_ZIP_URL = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"

# Output directory for the conversational text files. It is created at
# import time (including missing parents) so later writes can assume it exists.
CONV_DIR = Path("data/conversational")
CONV_DIR.mkdir(parents=True, exist_ok=True)
def install_package(pkg_name: str):
    """
    Install a Python package with pip, using the interpreter that is
    running this script.

    The command is passed to the OS as an argument list rather than a shell
    string, so neither the interpreter path (which may contain spaces) nor
    the package name (which may contain shell metacharacters such as the
    ``>`` in ``datasets>=2.0``) is interpreted by a shell.

    Args:
        pkg_name: Requirement to hand to ``pip install`` as a single argument.

    Returns:
        None. A failing pip run is not raised here; callers detect failure
        when they subsequently try to import the package.
    """
    # Local import so this block does not depend on the file's import header.
    import subprocess

    print(f"[data_prep] Installing package: {pkg_name}")
    # check=False keeps the original best-effort behaviour (pip's exit status
    # was ignored by the previous os.system call as well).
    subprocess.run([sys.executable, "-m", "pip", "install", pkg_name], check=False)
def prepare_cornell_via_convokit(output_path: str) -> bool:
    """
    Try to download Cornell Movie-Dialogs via ConvoKit (key: "movie-corpus")
    and write every non-empty utterance to output_path, one per line.

    The utterances are first written to ``<output_path>.part`` and only moved
    onto output_path once the whole corpus has been written. A failure part-way
    through therefore never leaves a truncated file at output_path, which a
    later run would otherwise mistake for a finished download.

    Args:
        output_path: Destination text file (UTF-8).

    Returns:
        True if the file was written successfully, False if ConvoKit is not
        available or any error occurred (errors are reported, not raised).
    """
    if not USE_CONVOKIT:
        print("[data_prep] ConvoKit not installed; skipping ConvoKit path.")
        return False

    print("[data_prep] Attempting to download Cornell Movie-Dialogs via ConvoKit...")
    partial_path = f"{output_path}.part"
    try:
        corpus = Corpus(filename=convokit_download("movie-corpus"))
        with open(partial_path, "w", encoding="utf-8") as fout:
            for utt in corpus.iter_utterances():
                text = utt.text.strip()
                if text:
                    # Keep one utterance per output line.
                    fout.write(text.replace("\n", " ") + "\n")
        # Promote the finished file in one step.
        os.replace(partial_path, output_path)
        print(f"[data_prep] Wrote Cornell Movie-Dialogs to {output_path} (via ConvoKit).")
        return True
    except NotImplementedError as nie:
        # Typically due to Unsloth error if GPU unsupported
        print("[data_prep] ConvoKit raised NotImplementedError (Unsloth/GPU issue).")
        print(f"[data_prep] Error: {nie}")
        return False
    except Exception as e:
        # Deliberately broad: any failure here just triggers the caller's fallback.
        print("[data_prep] ConvoKit path failed with exception:", file=sys.stderr)
        print(e, file=sys.stderr)
        return False
    finally:
        # After a successful os.replace the partial file no longer exists,
        # so this only removes leftovers from a failed attempt.
        if os.path.exists(partial_path):
            os.remove(partial_path)
def prepare_cornell_manual(output_path: str):
    """
    Fallback: download the Cornell ZIP manually, extract movie_lines.txt,
    and write every non-empty utterance to output_path, one per line.

    Download, extraction and parsing errors are reported on stderr and the
    function returns without raising. The utterances are first written to
    ``<output_path>.part`` and moved onto output_path only after parsing has
    finished, so a failure never leaves a truncated file at output_path.

    Args:
        output_path: Destination text file (UTF-8).

    Returns:
        None.
    """
    print("[data_prep] Falling back to manual download of Cornell Movie-Dialogs...")
    with tempfile.TemporaryDirectory() as tmpdir:
        zip_path = os.path.join(tmpdir, "cornell.zip")
        try:
            print(f"[data_prep] Downloading from {CORNELL_ZIP_URL} ...")
            urllib.request.urlretrieve(CORNELL_ZIP_URL, zip_path)
        except Exception as e:
            print(f"[data_prep] Error downloading Cornell corpus: {e}", file=sys.stderr)
            return

        try:
            with zipfile.ZipFile(zip_path, "r") as z:
                # The file may sit inside a top-level folder in the archive,
                # so match on the name's suffix instead of an exact path.
                member_name = next(
                    (name for name in z.namelist() if name.endswith("movie_lines.txt")),
                    None,
                )
                if member_name is None:
                    print("[data_prep] movie_lines.txt not found in ZIP.", file=sys.stderr)
                    return
                # extract() returns the path it actually created, which can
                # differ from a plain join when the member name is sanitised.
                extracted_path = z.extract(member_name, path=tmpdir)
        except Exception as e:
            print(f"[data_prep] Error extracting ZIP: {e}", file=sys.stderr)
            return

        partial_path = f"{output_path}.part"
        try:
            with open(extracted_path, "r", encoding="iso-8859-1", errors="ignore") as fin, open(
                partial_path, "w", encoding="utf-8"
            ) as fout:
                for line in fin:
                    # Only lines with exactly five " +++$+++ "-separated
                    # fields are used; the last field is the utterance text.
                    parts = line.split(" +++$+++ ")
                    if len(parts) == 5:
                        text = parts[-1].strip()
                        if text:
                            fout.write(text.replace("\n", " ") + "\n")
            # Promote the finished file in one step.
            os.replace(partial_path, output_path)
        except Exception as e:
            print(f"[data_prep] Error parsing movie_lines.txt: {e}", file=sys.stderr)
            return
        finally:
            # No-op after a successful os.replace; removes leftovers otherwise.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print(f"[data_prep] Wrote Cornell Movie-Dialogs to {output_path} (manual).")
def _iter_personachat_utterances(dataset, dataset_key):
    """Yield the raw utterances of the "train" and "valid" splits of *dataset*."""
    for split in ["train", "valid"]:
        for item in dataset[split]:
            if dataset_key == "persona_chat":
                # A missing "dialog" field raises KeyError, which the caller
                # treats as a failure of this dataset key.
                yield from item["dialog"]
            elif "dialog" in item:
                yield from item["dialog"]
            elif "utterance" in item:
                yield item["utterance"]


def prepare_personachat(output_path: str):
    """
    Attempt to download PersonaChat via Hugging Face Datasets and write every
    non-empty utterance to output_path, one per line.

    Tries the key "persona_chat" first and then "conv_ai_2". Any exception
    raised while loading or writing a key is reported on stderr and the next
    key is tried; if the `datasets` package cannot be imported even after an
    installation attempt, PersonaChat is skipped instead of crashing.

    Utterances are first written to ``<output_path>.part`` and moved onto
    output_path only after a key has been written completely, so a failed
    attempt never leaves an empty or truncated file at output_path.

    Args:
        output_path: Destination text file (UTF-8).

    Returns:
        None.
    """
    global load_dataset, HAS_DATASETS
    if not HAS_DATASETS:
        install_package("datasets")
        # Packages installed after interpreter start may be hidden by the
        # import system's cached directory listings.
        import importlib

        importlib.invalidate_caches()
        try:
            from datasets import load_dataset
        except ImportError as e:
            print(
                f"[data_prep] Could not import 'datasets' after installation attempt: {e}. "
                "Skipping PersonaChat.",
                file=sys.stderr,
            )
            return
        HAS_DATASETS = True

    partial_path = f"{output_path}.part"
    for dataset_key in ["persona_chat", "conv_ai_2"]:
        try:
            print(f"[data_prep] Attempting to load '{dataset_key}' via Hugging Face Datasets...")
            if dataset_key == "conv_ai_2":
                dataset = load_dataset(dataset_key, trust_remote_code=True)
            else:
                dataset = load_dataset(dataset_key)
            print(f"[data_prep] Successfully loaded '{dataset_key}'. Writing to {output_path}...")
            with open(partial_path, "w", encoding="utf-8") as fout:
                # NOTE(review): assumes every dialog entry is a str; a
                # non-str entry fails this key via the except below - verify
                # against the dataset schema.
                for raw in _iter_personachat_utterances(dataset, dataset_key):
                    text = raw.strip()
                    if text:
                        fout.write(text.replace("\n", " ") + "\n")
            # Promote the finished file in one step.
            os.replace(partial_path, output_path)
            print(f"[data_prep] Finished writing PersonaChat ({dataset_key}) to {output_path}.")
            return
        except Exception as e:
            # Deliberately broad: a failing key must not stop the next one.
            print(f"[data_prep] Failed '{dataset_key}': {e}", file=sys.stderr)
        finally:
            # No-op after a successful os.replace; removes leftovers otherwise.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("[data_prep] Could not load PersonaChat under any key. Skipping PersonaChat.", file=sys.stderr)
def main():
    """
    Build the conversational text files under CONV_DIR.

    Each corpus is skipped when its output file already exists. Cornell
    Movie-Dialogs is attempted via ConvoKit first and via the manual ZIP
    download if that reports failure; PersonaChat is then prepared.
    """
    cornell_out = CONV_DIR / "cornell_movie_dialogs.txt"
    persona_out = CONV_DIR / "persona_chat.txt"

    # 1) Cornell Movie-Dialogs: ConvoKit first, manual download as fallback.
    if cornell_out.is_file():
        print(f"[data_prep] Skipping Cornell: '{cornell_out}' already exists.")
    elif not prepare_cornell_via_convokit(str(cornell_out)):
        prepare_cornell_manual(str(cornell_out))

    # 2) PersonaChat.
    if persona_out.is_file():
        print(f"[data_prep] Skipping PersonaChat: '{persona_out}' already exists.")
    else:
        prepare_personachat(str(persona_out))

    print("[data_prep] All done. You can now include data/conversational/ in Nora's training.")


if __name__ == "__main__":
    main()