NOVA/nova_data/pipeline.py

"""
Data pipeline for legal dataset acquisition and processing
"""

import json
from pathlib import Path
from typing import List, Dict, Optional
from tqdm import tqdm
import hashlib

from .legal_sources import LegalDatasetRegistry, DatasetSource


class DataPipeline:
    """
    Legal-only data acquisition and processing pipeline

    Features:
    - License tracking and verification
    - Provenance recording
    - Deduplication
    - Text cleaning
    """

    def __init__(self, output_dir: str = "data/processed"):
        """
        Args:
            output_dir: Directory for processed data
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # License ledger
        self.ledger_path = self.output_dir / "license_ledger.json"
        self.ledger = self._load_ledger()

    def _load_ledger(self) -> Dict:
        """Load license ledger"""
        if self.ledger_path.exists():
            with open(self.ledger_path, 'r') as f:
                return json.load(f)
        return {'sources': [], 'shards': []}

    def _save_ledger(self):
        """Save license ledger"""
        with open(self.ledger_path, 'w') as f:
            json.dump(self.ledger, f, indent=2)

    def download_source(self, source_name: str, dry_run: bool = False):
        """
        Download a legal dataset source

        Args:
            source_name: Name of source from registry
            dry_run: If True, don't actually download (just show info)
        """
        source = LegalDatasetRegistry.get_source(source_name)

        if not source:
            raise ValueError(f"Unknown source: {source_name}")

        print(f"Source: {source.name}")
        print(f"Description: {source.description}")
        print(f"License: {source.license.value}")
        print(f"Estimated size: {source.estimated_size_gb} GB")

        if dry_run:
            print("\n[DRY RUN] Would download from:", source.url)
            return

        print("\nDownloading...")
        # TODO: Implement actual download logic for each source
        # For now, this is a placeholder

        # Record in ledger
        self.ledger['sources'].append({
            'name': source.name,
            'license': source.license.value,
            'url': source.url,
            'download_date': str(Path.ctime(self.output_dir)),
        })

        self._save_ledger()
        print("✓ Download complete and recorded in ledger")

    def create_toy_dataset(self):
        """
        Create a tiny toy dataset for offline e2e demo

        This is a minimal legal dataset for testing without downloads
        """
        toy_data_path = Path("data/toy_dataset/toy.txt")
        toy_data_path.parent.mkdir(parents=True, exist_ok=True)

        # Public domain sample texts
        sample_texts = [
            "The quick brown fox jumps over the lazy dog.",
            "To be or not to be, that is the question.",
            "In the beginning was the Word.",
            "It was the best of times, it was the worst of times.",
            "Call me Ishmael.",
            "All happy families are alike.",
            "It is a truth universally acknowledged.",
            "The past is a foreign country; they do things differently there.",
            "Once upon a time in a land far away.",
            "The sun rose over the horizon, painting the sky in shades of gold.",
        ] * 100  # Repeat for more data

        with open(toy_data_path, 'w', encoding='utf-8') as f:
            for text in sample_texts:
                f.write(text + '\n')

        print(f"✓ Toy dataset created: {toy_data_path}")

        # Record in ledger
        self.ledger['sources'].append({
            'name': 'toy-dataset',
            'license': 'public-domain',
            'description': 'Minimal toy dataset for testing',
            'created': 'generated',
        })

        self._save_ledger()

        return str(toy_data_path)

    def verify_licenses(self) -> bool:
        """
        Verify all data sources have proper licenses

        Returns:
            True if all sources are properly licensed
        """
        print("Verifying licenses...")

        all_valid = True

        for source_entry in self.ledger['sources']:
            name = source_entry.get('name')
            license_str = source_entry.get('license')

            print(f"  {name}: {license_str}")

            # Check if license is in our approved list
            valid_licenses = [lic.value for lic in LegalDatasetRegistry.License]
            if license_str not in valid_licenses and license_str != 'public-domain':
                print(f"    ⚠️ WARNING: Unrecognized license!")
                all_valid = False

        if all_valid:
            print("\n✓ All sources properly licensed")
        else:
            print("\n⚠️ Some sources have unverified licenses")

        return all_valid

    def show_ledger(self):
        """Print license ledger"""
        print("\nLicense Ledger:")
        print("=" * 60)

        print(f"\nSources ({len(self.ledger['sources'])}):")
        for source in self.ledger['sources']:
            print(f"  - {source['name']}: {source['license']}")

        print(f"\nShards ({len(self.ledger['shards'])}):")
        for shard in self.ledger.get('shards', []):
            print(f"  - {shard['name']}")