Added another learning source for Nora. Also added the requirements.

This commit is contained in:
2025-06-09 14:25:11 -04:00
parent da23742671
commit 5d53ba7cb8
14 changed files with 1070 additions and 78 deletions

217
data_prep.py Normal file
View File

@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
data_prep.py

1) Attempts to download Cornell Movie-Dialogs via ConvoKit (key: "movie-corpus").
   - If ConvoKit/Unsloth fails, falls back to manual ZIP download/extraction.
2) Attempts to download PersonaChat via Hugging Face Datasets:
   - First tries "persona_chat" (older key).
   - If that fails, tries "conv_ai_2" (alias).
   - Catches any exception to skip gracefully.
3) Writes each utterance to:
   data/conversational/cornell_movie_dialogs.txt
   data/conversational/persona_chat.txt

After running, you'll have:

data/
├── books/           (your original Gutenberg .txt files)
└── conversational/
    ├── cornell_movie_dialogs.txt
    └── persona_chat.txt

Then retrain or fine-tune Nora on data/books/ + data/conversational/.
"""
import os
import sys
import zipfile
import tempfile
import urllib.request
from pathlib import Path

# === 1) Attempt to import ConvoKit for Cornell Movie-Dialogs ===
# ConvoKit is optional; when absent, prepare_cornell_manual() is the fallback.
USE_CONVOKIT = True
try:
    from convokit import Corpus, download as convokit_download
except ImportError:
    USE_CONVOKIT = False

# === 2) Attempt to import Hugging Face Datasets ===
# Also optional; prepare_personachat() attempts a pip install on demand.
HAS_DATASETS = True
try:
    from datasets import load_dataset
except ImportError:
    HAS_DATASETS = False

# Directory for conversational data; created eagerly so every writer below
# can assume it exists.
CONV_DIR = Path("data/conversational")
CONV_DIR.mkdir(parents=True, exist_ok=True)

# Official ZIP URL (fallback) for Cornell Movie-Dialogs
CORNELL_ZIP_URL = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
def install_package(pkg_name: str) -> None:
    """
    Install a Python package into the current interpreter's environment.

    Uses ``subprocess.run`` with an argument list (``shell=False``), which
    handles interpreter paths containing spaces without manual quoting and
    avoids shell injection through the package name. A non-zero pip exit
    code is reported to stderr rather than silently ignored.

    Args:
        pkg_name: Name of the package to install (as pip understands it).
    """
    import subprocess  # local import: only needed on this fallback path

    print(f"[data_prep] Installing package: {pkg_name}")
    # check=False mirrors the old os.system behavior: report, don't raise.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", pkg_name],
        check=False,
    )
    if result.returncode != 0:
        print(
            f"[data_prep] pip install {pkg_name} exited with code {result.returncode}",
            file=sys.stderr,
        )
def prepare_cornell_via_convokit(output_path: str) -> bool:
    """
    Download the Cornell Movie-Dialogs corpus through ConvoKit and dump
    every non-empty utterance, one per line, to *output_path*.

    Returns:
        True when the file was written successfully; False when ConvoKit
        is unavailable or the download/parse failed for any reason.
    """
    if not USE_CONVOKIT:
        print("[data_prep] ConvoKit not installed; skipping ConvoKit path.")
        return False

    print("[data_prep] Attempting to download Cornell Movie-Dialogs via ConvoKit...")
    try:
        corpus = Corpus(filename=convokit_download("movie-corpus"))
        with open(output_path, "w", encoding="utf-8") as sink:
            # Strip whitespace, drop empty utterances, flatten embedded newlines.
            cleaned = (utt.text.strip() for utt in corpus.iter_utterances())
            sink.writelines(
                text.replace("\n", " ") + "\n" for text in cleaned if text
            )
        print(f"[data_prep] Wrote Cornell Movie-Dialogs to {output_path} (via ConvoKit).")
        return True
    except NotImplementedError as nie:
        # Typically due to Unsloth error if GPU unsupported
        print("[data_prep] ConvoKit raised NotImplementedError (Unsloth/GPU issue).")
        print(f"[data_prep] Error: {nie}")
        return False
    except Exception as e:
        print("[data_prep] ConvoKit path failed with exception:", file=sys.stderr)
        print(e, file=sys.stderr)
        return False
def prepare_cornell_manual(output_path: str):
    """
    Fallback path: fetch the official Cornell Movie-Dialogs ZIP, pull
    movie_lines.txt out of it, and write one utterance per line to
    *output_path*.

    Any download, extraction, or parse failure is reported to stderr and
    the function returns without raising.
    """
    print("[data_prep] Falling back to manual download of Cornell Movie-Dialogs...")
    with tempfile.TemporaryDirectory() as workdir:
        archive = os.path.join(workdir, "cornell.zip")

        # Phase 1: download the archive.
        try:
            print(f"[data_prep] Downloading from {CORNELL_ZIP_URL} ...")
            urllib.request.urlretrieve(CORNELL_ZIP_URL, archive)
        except Exception as e:
            print(f"[data_prep] Error downloading Cornell corpus: {e}", file=sys.stderr)
            return

        # Phase 2: locate and extract movie_lines.txt from the ZIP.
        try:
            with zipfile.ZipFile(archive, "r") as zf:
                wanted = next(
                    (name for name in zf.namelist() if name.endswith("movie_lines.txt")),
                    None,
                )
                if wanted is None:
                    print("[data_prep] movie_lines.txt not found in ZIP.", file=sys.stderr)
                    return
                zf.extract(wanted, path=workdir)
                lines_path = os.path.join(workdir, wanted)
        except Exception as e:
            print(f"[data_prep] Error extracting ZIP: {e}", file=sys.stderr)
            return

        # Phase 3: parse. Each record has 5 fields delimited by " +++$+++ ";
        # the final field is the utterance text.
        try:
            with open(lines_path, "r", encoding="iso-8859-1", errors="ignore") as src, open(
                output_path, "w", encoding="utf-8"
            ) as dst:
                for record in src:
                    fields = record.split(" +++$+++ ")
                    if len(fields) != 5:
                        continue
                    utterance = fields[-1].strip()
                    if utterance:
                        dst.write(utterance.replace("\n", " ") + "\n")
        except Exception as e:
            print(f"[data_prep] Error parsing movie_lines.txt: {e}", file=sys.stderr)
            return
    print(f"[data_prep] Wrote Cornell Movie-Dialogs to {output_path} (manual).")
def prepare_personachat(output_path: str):
    """
    Download PersonaChat via Hugging Face Datasets and write every utterance,
    one per line, to *output_path*.

    Tries the "persona_chat" key first, then the "conv_ai_2" alias. Any
    failure — including the `datasets` package remaining unavailable after an
    on-demand install attempt — is reported to stderr and the dataset is
    skipped, honoring the module's "skip gracefully" contract. (The original
    code let the post-install ImportError propagate and crash the script.)
    """
    global load_dataset
    if not HAS_DATASETS:
        install_package("datasets")
        try:
            from datasets import load_dataset  # rebinds the module-level name
        except ImportError as e:
            # pip install failed (offline, no permissions, ...) — skip, don't crash.
            print(
                f"[data_prep] 'datasets' still unavailable after install attempt: {e}",
                file=sys.stderr,
            )
            print(
                "[data_prep] Could not load PersonaChat under any key. Skipping PersonaChat.",
                file=sys.stderr,
            )
            return

    def _write_utterances(fout, raw_lines) -> None:
        # Shared normalization: strip, drop empties, flatten embedded newlines.
        for raw in raw_lines:
            text = raw.strip()
            if text:
                fout.write(text.replace("\n", " ") + "\n")

    for dataset_key in ["persona_chat", "conv_ai_2"]:
        try:
            print(f"[data_prep] Attempting to load '{dataset_key}' via Hugging Face Datasets...")
            # conv_ai_2 ships a dataset loading script, which recent `datasets`
            # versions only execute when explicitly trusted.
            if dataset_key == "conv_ai_2":
                dataset = load_dataset(dataset_key, trust_remote_code=True)
            else:
                dataset = load_dataset(dataset_key)
            print(f"[data_prep] Successfully loaded '{dataset_key}'. Writing to {output_path}...")
            with open(output_path, "w", encoding="utf-8") as fout:
                for split in ["train", "valid"]:
                    for item in dataset[split]:
                        # Both keys expose a "dialog" list; conv_ai_2 may
                        # instead carry a single "utterance" string.
                        if "dialog" in item:
                            _write_utterances(fout, item["dialog"])
                        elif "utterance" in item:
                            _write_utterances(fout, [item["utterance"]])
            print(f"[data_prep] Finished writing PersonaChat ({dataset_key}) to {output_path}.")
            return
        except Exception as e:
            print(f"[data_prep] Failed '{dataset_key}': {e}", file=sys.stderr)
            # Fall through to the next candidate key.
    print("[data_prep] Could not load PersonaChat under any key. Skipping PersonaChat.", file=sys.stderr)
def main():
    """Build the conversational corpus files, skipping any that already exist."""
    cornell_path = CONV_DIR / "cornell_movie_dialogs.txt"
    persona_path = CONV_DIR / "persona_chat.txt"

    # Cornell Movie-Dialogs: ConvoKit first, manual ZIP download as fallback.
    if cornell_path.is_file():
        print(f"[data_prep] Skipping Cornell: '{cornell_path}' already exists.")
    elif not prepare_cornell_via_convokit(str(cornell_path)):
        prepare_cornell_manual(str(cornell_path))

    # PersonaChat via Hugging Face Datasets.
    if persona_path.is_file():
        print(f"[data_prep] Skipping PersonaChat: '{persona_path}' already exists.")
    else:
        prepare_personachat(str(persona_path))

    print("[data_prep] All done. You can now include data/conversational/ in Nora's training.")


if __name__ == "__main__":
    main()