""" Data pipeline for legal dataset acquisition and processing """ import json from pathlib import Path from typing import List, Dict, Optional from tqdm import tqdm import hashlib from .legal_sources import LegalDatasetRegistry, DatasetSource class DataPipeline: """ Legal-only data acquisition and processing pipeline Features: - License tracking and verification - Provenance recording - Deduplication - Text cleaning """ def __init__(self, output_dir: str = "data/processed"): """ Args: output_dir: Directory for processed data """ self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) # License ledger self.ledger_path = self.output_dir / "license_ledger.json" self.ledger = self._load_ledger() def _load_ledger(self) -> Dict: """Load license ledger""" if self.ledger_path.exists(): with open(self.ledger_path, 'r') as f: return json.load(f) return {'sources': [], 'shards': []} def _save_ledger(self): """Save license ledger""" with open(self.ledger_path, 'w') as f: json.dump(self.ledger, f, indent=2) def download_source(self, source_name: str, dry_run: bool = False): """ Download a legal dataset source Args: source_name: Name of source from registry dry_run: If True, don't actually download (just show info) """ source = LegalDatasetRegistry.get_source(source_name) if not source: raise ValueError(f"Unknown source: {source_name}") print(f"Source: {source.name}") print(f"Description: {source.description}") print(f"License: {source.license.value}") print(f"Estimated size: {source.estimated_size_gb} GB") if dry_run: print("\n[DRY RUN] Would download from:", source.url) return print("\nDownloading...") # TODO: Implement actual download logic for each source # For now, this is a placeholder # Record in ledger self.ledger['sources'].append({ 'name': source.name, 'license': source.license.value, 'url': source.url, 'download_date': str(Path.ctime(self.output_dir)), }) self._save_ledger() print("✓ Download complete and recorded in ledger") def create_toy_dataset(self): """ Create a tiny toy dataset for offline e2e demo This is a minimal legal dataset for testing without downloads """ toy_data_path = Path("data/toy_dataset/toy.txt") toy_data_path.parent.mkdir(parents=True, exist_ok=True) # Public domain sample texts sample_texts = [ "The quick brown fox jumps over the lazy dog.", "To be or not to be, that is the question.", "In the beginning was the Word.", "It was the best of times, it was the worst of times.", "Call me Ishmael.", "All happy families are alike.", "It is a truth universally acknowledged.", "The past is a foreign country; they do things differently there.", "Once upon a time in a land far away.", "The sun rose over the horizon, painting the sky in shades of gold.", ] * 100 # Repeat for more data with open(toy_data_path, 'w', encoding='utf-8') as f: for text in sample_texts: f.write(text + '\n') print(f"✓ Toy dataset created: {toy_data_path}") # Record in ledger self.ledger['sources'].append({ 'name': 'toy-dataset', 'license': 'public-domain', 'description': 'Minimal toy dataset for testing', 'created': 'generated', }) self._save_ledger() return str(toy_data_path) def verify_licenses(self) -> bool: """ Verify all data sources have proper licenses Returns: True if all sources are properly licensed """ print("Verifying licenses...") all_valid = True for source_entry in self.ledger['sources']: name = source_entry.get('name') license_str = source_entry.get('license') print(f" {name}: {license_str}") # Check if license is in our approved list valid_licenses = [lic.value for lic in LegalDatasetRegistry.License] if license_str not in valid_licenses and license_str != 'public-domain': print(f" ⚠️ WARNING: Unrecognized license!") all_valid = False if all_valid: print("\n✓ All sources properly licensed") else: print("\n⚠️ Some sources have unverified licenses") return all_valid def show_ledger(self): """Print license ledger""" print("\nLicense Ledger:") print("=" * 60) print(f"\nSources ({len(self.ledger['sources'])}):") for source in self.ledger['sources']: print(f" - {source['name']}: {source['license']}") print(f"\nShards ({len(self.ledger['shards'])}):") for shard in self.ledger.get('shards', []): print(f" - {shard['name']}")