"""Legal dataset sources and license tracking."""

from dataclasses import dataclass
from enum import Enum
from typing import List, Optional


class License(Enum):
    """Open licenses that a dataset source can be tagged with.

    Each member's value is a short lowercase identifier string for the
    license.
    """

    PUBLIC_DOMAIN = "public-domain"
    CC0 = "cc0-1.0"
    CC_BY = "cc-by-4.0"
    MIT = "mit"
    APACHE_2 = "apache-2.0"
    BSD = "bsd-3-clause"


@dataclass
class DatasetSource:
    """Definition of a legal dataset source.

    Attributes:
        name: Unique key used to look the source up in the registry.
        description: Human-readable summary of the dataset.
        license: License the dataset is recorded under.
        url: Location of the dataset or its landing page.
        download_function: Name of the function that downloads the dataset.
            Only the name is stored here; the function itself is not
            defined in this module.
        estimated_size_gb: Approximate size in gigabytes.
        language: Language code of the content; defaults to ``"en"``.
    """

    name: str
    description: str
    license: License
    url: str
    download_function: str  # Name of function to download
    estimated_size_gb: float
    language: str = "en"


class LegalDatasetRegistry:
    """Registry of legal, properly licensed datasets for NOVA.

    IMPORTANT: Only includes datasets with permissive licenses
    suitable for training language models.

    All lookups are classmethods over the class-level ``SOURCES`` list; the
    class is not meant to be instantiated.
    """

    # NOTE(review): the license tags below are recorded as declared in this
    # file. Some upstream sources may be published under terms the License
    # enum does not model exactly (e.g. Wikipedia text is share-alike, and C4
    # is commonly listed under ODC-BY) -- confirm each entry against its
    # upstream license before relying on it.
    SOURCES: List[DatasetSource] = [
        DatasetSource(
            name="wikipedia-en",
            description="English Wikipedia dump (latest)",
            license=License.CC_BY,
            url="https://dumps.wikimedia.org/enwiki/latest/",
            download_function="download_wikipedia",
            estimated_size_gb=20.0,
            language="en",
        ),
        DatasetSource(
            name="project-gutenberg",
            description="Project Gutenberg public domain books",
            license=License.PUBLIC_DOMAIN,
            url="https://www.gutenberg.org/",
            download_function="download_gutenberg",
            estimated_size_gb=15.0,
            language="en",
        ),
        DatasetSource(
            name="openwebtext",
            description="Open reproduction of WebText (Reddit links)",
            license=License.CC0,
            url="https://huggingface.co/datasets/Skylion007/openwebtext",
            download_function="download_openwebtext",
            estimated_size_gb=38.0,
            language="en",
        ),
        DatasetSource(
            name="c4",
            description="Colossal Clean Crawled Corpus (C4)",
            license=License.CC_BY,
            url="https://huggingface.co/datasets/c4",
            download_function="download_c4",
            estimated_size_gb=300.0,
            language="en",
        ),
        DatasetSource(
            name="the-pile-arxiv",
            description="ArXiv papers from The Pile",
            license=License.MIT,
            url="https://pile.eleuther.ai/",
            download_function="download_pile_arxiv",
            estimated_size_gb=60.0,
            language="en",
        ),
    ]

    @classmethod
    def list_sources(cls) -> List[DatasetSource]:
        """List all available legal sources.

        Returns:
            A new list holding every registered source in registry order.
            The list is a shallow copy, so sorting, clearing or appending to
            it does not alter the registry; the ``DatasetSource`` objects
            inside it are still the registry's own instances.
        """
        # Copy so callers cannot mutate the shared class-level list.
        return list(cls.SOURCES)

    @classmethod
    def get_source(cls, name: str) -> Optional[DatasetSource]:
        """Get a source by its exact name.

        Args:
            name: Value compared (case-sensitively) with each source's
                ``name``.

        Returns:
            The first source whose name matches, or ``None`` if none does.
        """
        return next((src for src in cls.SOURCES if src.name == name), None)

    @classmethod
    def filter_by_license(cls, license: License) -> List[DatasetSource]:
        """Filter sources by license.

        Args:
            license: License to match. The parameter name is kept for
                backward compatibility with keyword callers.

        Returns:
            Sources whose ``license`` equals ``license``, in registry order;
            an empty list when nothing matches.
        """
        return [src for src in cls.SOURCES if src.license == license]

    @classmethod
    def filter_by_size(cls, max_size_gb: float) -> List[DatasetSource]:
        """Filter sources by estimated size.

        Args:
            max_size_gb: Inclusive upper bound in gigabytes.

        Returns:
            Sources whose ``estimated_size_gb`` is less than or equal to
            ``max_size_gb``, in registry order; an empty list when nothing
            fits.
        """
        return [
            src for src in cls.SOURCES if src.estimated_size_gb <= max_size_gb
        ]