# NOVA/nova_data/legal_sources.py

"""
Legal dataset sources and license tracking
"""

from dataclasses import dataclass
from typing import List, Optional
from enum import Enum


class License(Enum):
    """Open licenses a registered dataset may be tagged with.

    Each member's value is a lowercase license identifier string.
    """

    # Dedication-style terms.
    PUBLIC_DOMAIN = "public-domain"
    CC0 = "cc0-1.0"

    # Creative Commons Attribution.
    CC_BY = "cc-by-4.0"

    # Software-style permissive licenses.
    MIT = "mit"
    APACHE_2 = "apache-2.0"
    BSD = "bsd-3-clause"


@dataclass
class DatasetSource:
    """Describe a single dataset entry.

    Fields are declared in constructor order; ``language`` is the only
    field with a default value.
    """

    # Identifier for the source.
    name: str
    # Human-readable summary of what the dataset contains.
    description: str
    # License the dataset is recorded under.
    license: License
    # Web location associated with the dataset.
    url: str
    # Name of the function used to download the dataset (kept as a string,
    # not a callable).
    download_function: str
    # Approximate size in gigabytes.
    estimated_size_gb: float
    # Language code of the content.
    language: str = "en"


class LegalDatasetRegistry:
    """
    Registry of legal, properly licensed datasets for NOVA

    IMPORTANT: Only includes datasets with permissive licenses
    suitable for training language models

    Every accessor is a classmethod reading the class-level ``SOURCES``
    list, so the registry is used without being instantiated.
    """

    # Entries are kept in registration order; the accessors below preserve it.
    SOURCES: List[DatasetSource] = [
        # NOTE(review): Wikipedia text is published under CC BY-SA
        # (share-alike); License has no member for that, so CC_BY may be
        # understating the obligations -- confirm the intended tag.
        DatasetSource(
            name="wikipedia-en",
            description="English Wikipedia dump (latest)",
            license=License.CC_BY,
            url="https://dumps.wikimedia.org/enwiki/latest/",
            download_function="download_wikipedia",
            estimated_size_gb=20.0,
            language="en",
        ),
        DatasetSource(
            name="project-gutenberg",
            description="Project Gutenberg public domain books",
            license=License.PUBLIC_DOMAIN,
            url="https://www.gutenberg.org/",
            download_function="download_gutenberg",
            estimated_size_gb=15.0,
            language="en",
        ),
        DatasetSource(
            name="openwebtext",
            description="Open reproduction of WebText (Reddit links)",
            license=License.CC0,
            url="https://huggingface.co/datasets/Skylion007/openwebtext",
            download_function="download_openwebtext",
            estimated_size_gb=38.0,
            language="en",
        ),
        # NOTE(review): the upstream C4 dataset card appears to list ODC-BY
        # rather than CC BY -- confirm before relying on this tag.
        DatasetSource(
            name="c4",
            description="Colossal Clean Crawled Corpus (C4)",
            license=License.CC_BY,
            url="https://huggingface.co/datasets/c4",
            download_function="download_c4",
            estimated_size_gb=300.0,
            language="en",
        ),
        # NOTE(review): MIT looks like the license of The Pile's code, while
        # individual arXiv papers carry their own licenses -- confirm.
        DatasetSource(
            name="the-pile-arxiv",
            description="ArXiv papers from The Pile",
            license=License.MIT,
            url="https://pile.eleuther.ai/",
            download_function="download_pile_arxiv",
            estimated_size_gb=60.0,
            language="en",
        ),
    ]

    @classmethod
    def list_sources(cls) -> List[DatasetSource]:
        """List all available legal sources.

        Returns:
            A new list holding every registered source in registration
            order. The list is a shallow copy, so sorting, appending to or
            clearing it does not alter the registry; the ``DatasetSource``
            objects themselves are shared.
        """
        return list(cls.SOURCES)

    @classmethod
    def get_source(cls, name: str) -> Optional[DatasetSource]:
        """Get a source by name.

        Args:
            name: Value compared for exact equality against each
                source's ``name``.

        Returns:
            The first source whose name matches, or ``None`` when no
            source matches.
        """
        return next((src for src in cls.SOURCES if src.name == name), None)

    @classmethod
    def filter_by_license(cls, license: License) -> List[DatasetSource]:
        """Filter sources by license.

        Args:
            license: License each returned source must be tagged with.

        Returns:
            A new list of the sources whose ``license`` equals *license*,
            in registration order (empty when none match).
        """
        return [src for src in cls.SOURCES if src.license == license]

    @classmethod
    def filter_by_size(cls, max_size_gb: float) -> List[DatasetSource]:
        """Filter sources by size.

        Args:
            max_size_gb: Inclusive upper bound on ``estimated_size_gb``.

        Returns:
            A new list of the sources whose estimated size does not
            exceed *max_size_gb*, in registration order.
        """
        return [
            src for src in cls.SOURCES if src.estimated_size_gb <= max_size_gb
        ]