Complete transformer LLM built from scratch with: Core Features: - Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache) - SentencePiece tokenizer (BPE/Unigram) - Training pipeline (AMP, gradient checkpointing, DDP) - Persona system with personality matrix (NO AI disclosure by default) - Genetic evolution (NOVA-EVO) for hyperparameter optimization - Legal-only data pipeline with license tracking - Chat interface (CLI + REST API) - Conversation memory (SQLite) Model Sizes: - 125M, 350M, 1.3B, 3B parameters - Local-first, runs on CPU or GPU - Python 3.10.6+, PyTorch 2.0+ Personas: - girlfriend_gentle (high warmth, high empathy) - girlfriend_playful (high humor, high playfulness) - girlfriend_supportive (balanced, default) Documentation: - Complete README with quickstart - Model card with ethical considerations - Privacy documentation (local-first, zero telemetry) - Data licenses and attribution - Contributing guide Infrastructure: - GitHub Actions CI/CD - Comprehensive test suite - Quickstart script - CLI tool License: Apache 2.0 🤖 Generated with Claude Code https://claude.com/claude-code Co-Authored-By: Claude <noreply@anthropic.com>
110 lines
3.2 KiB
Python
110 lines
3.2 KiB
Python
"""
|
|
Legal dataset sources and license tracking
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import List, Optional
|
|
from enum import Enum
|
|
|
|
|
|
class License(Enum):
    """Open licenses that a dataset source can be tagged with.

    Every member's value is a lowercase license identifier string, so a
    member can be recovered from its identifier with ``License("mit")``.
    """

    # Public-domain style dedications
    PUBLIC_DOMAIN = "public-domain"
    CC0 = "cc0-1.0"

    # Creative Commons Attribution
    CC_BY = "cc-by-4.0"

    # Software-style licenses
    MIT = "mit"
    APACHE_2 = "apache-2.0"
    BSD = "bsd-3-clause"
|
|
|
|
|
|
@dataclass
class DatasetSource:
    """Record describing a single legal dataset source.

    Attributes:
        name: Identifier of the source.
        description: Human-readable summary of the source.
        license: ``License`` member the source is tagged with.
        url: Location of the source.
        download_function: Name of the function used to download the source.
        estimated_size_gb: Estimated size of the source (in GB, per the
            field name).
        language: Language tag of the source; defaults to ``"en"``.
    """

    # Required fields (positional order is part of the constructor signature)
    name: str
    description: str
    license: License
    url: str
    download_function: str
    estimated_size_gb: float

    # Optional fields
    language: str = "en"
|
|
|
|
|
|
class LegalDatasetRegistry:
    """
    Registry of legal, properly licensed datasets for NOVA.

    IMPORTANT: Only datasets with permissive licenses suitable for
    training language models belong in ``SOURCES``.

    Every accessor is a classmethod that reads ``SOURCES``. All accessors
    that return a list return a *new* list, so callers may sort, slice or
    clear the result without altering the registry itself.
    """

    # NOTE(review): some license tags below look like approximations forced
    # by the available ``License`` members -- confirm before relying on them
    # for attribution or compliance:
    #   * wikipedia-en: Wikipedia text is published under CC BY-SA, which
    #     adds a share-alike condition that plain CC BY does not have.
    #   * c4: the Hugging Face dataset card lists ODC-BY, not CC BY.
    #   * the-pile-arxiv: arXiv papers carry per-paper licenses; MIT may
    #     only describe The Pile's tooling -- TODO confirm.
    SOURCES: List[DatasetSource] = [
        DatasetSource(
            name="wikipedia-en",
            description="English Wikipedia dump (latest)",
            license=License.CC_BY,
            url="https://dumps.wikimedia.org/enwiki/latest/",
            download_function="download_wikipedia",
            estimated_size_gb=20.0,
            language="en",
        ),
        DatasetSource(
            name="project-gutenberg",
            description="Project Gutenberg public domain books",
            license=License.PUBLIC_DOMAIN,
            url="https://www.gutenberg.org/",
            download_function="download_gutenberg",
            estimated_size_gb=15.0,
            language="en",
        ),
        DatasetSource(
            name="openwebtext",
            description="Open reproduction of WebText (Reddit links)",
            license=License.CC0,
            url="https://huggingface.co/datasets/Skylion007/openwebtext",
            download_function="download_openwebtext",
            estimated_size_gb=38.0,
            language="en",
        ),
        DatasetSource(
            name="c4",
            description="Colossal Clean Crawled Corpus (C4)",
            license=License.CC_BY,
            url="https://huggingface.co/datasets/c4",
            download_function="download_c4",
            estimated_size_gb=300.0,
            language="en",
        ),
        DatasetSource(
            name="the-pile-arxiv",
            description="ArXiv papers from The Pile",
            license=License.MIT,
            url="https://pile.eleuther.ai/",
            download_function="download_pile_arxiv",
            estimated_size_gb=60.0,
            language="en",
        ),
    ]

    @classmethod
    def list_sources(cls) -> List[DatasetSource]:
        """List all available legal sources.

        Returns:
            A new list containing every entry of ``SOURCES`` in registry
            order.
        """
        # Return a shallow copy: handing out the class-level list itself
        # would let any caller's sort()/clear()/append() silently change
        # the registry for everyone else.
        return list(cls.SOURCES)

    @classmethod
    def get_source(cls, name: str) -> Optional[DatasetSource]:
        """Get a source by name.

        Args:
            name: Exact (case-sensitive) ``DatasetSource.name`` to look up.

        Returns:
            The first source in ``SOURCES`` whose name equals ``name``, or
            ``None`` when no source matches.
        """
        return next((src for src in cls.SOURCES if src.name == name), None)

    @classmethod
    def filter_by_license(cls, license: License) -> List[DatasetSource]:
        """Filter sources by license.

        Args:
            license: License that returned sources must be tagged with.

        Returns:
            A new list of the sources whose ``license`` equals ``license``
            (empty when none match).
        """
        return [src for src in cls.SOURCES if src.license == license]

    @classmethod
    def filter_by_size(cls, max_size_gb: float) -> List[DatasetSource]:
        """Filter sources by size.

        Args:
            max_size_gb: Upper bound compared against each source's
                ``estimated_size_gb``; the bound is inclusive.

        Returns:
            A new list of the sources whose ``estimated_size_gb`` is less
            than or equal to ``max_size_gb`` (empty when none qualify).
        """
        return [
            src for src in cls.SOURCES if src.estimated_size_gb <= max_size_gb
        ]
|