Initial commit: NOVA - Neuro-Optimizing Versatile Agent
Complete transformer LLM built from scratch with: Core Features: - Full transformer architecture (RoPE, RMSNorm, SwiGLU, KV-cache) - SentencePiece tokenizer (BPE/Unigram) - Training pipeline (AMP, gradient checkpointing, DDP) - Persona system with personality matrix (NO AI disclosure by default) - Genetic evolution (NOVA-EVO) for hyperparameter optimization - Legal-only data pipeline with license tracking - Chat interface (CLI + REST API) - Conversation memory (SQLite) Model Sizes: - 125M, 350M, 1.3B, 3B parameters - Local-first, runs on CPU or GPU - Python 3.10.6+, PyTorch 2.0+ Personas: - girlfriend_gentle (high warmth, high empathy) - girlfriend_playful (high humor, high playfulness) - girlfriend_supportive (balanced, default) Documentation: - Complete README with quickstart - Model card with ethical considerations - Privacy documentation (local-first, zero telemetry) - Data licenses and attribution - Contributing guide Infrastructure: - GitHub Actions CI/CD - Comprehensive test suite - Quickstart script - CLI tool License: Apache 2.0 🤖 Generated with Claude Code https://claude.com/claude-code Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
9
.claude/settings.local.json
Normal file
9
.claude/settings.local.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(git init:*)"
|
||||||
|
],
|
||||||
|
"deny": [],
|
||||||
|
"ask": []
|
||||||
|
}
|
||||||
|
}
|
105
.github/workflows/ci.yml
vendored
Normal file
105
.github/workflows/ci.yml
vendored
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
name: NOVA CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [ main, dev ]
|
||||||
|
pull_request:
|
||||||
|
branches: [ main ]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
os: [ubuntu-latest, windows-latest]
|
||||||
|
python-version: ['3.10', '3.11']
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pytest pytest-cov ruff black mypy
|
||||||
|
|
||||||
|
- name: Lint with ruff
|
||||||
|
run: |
|
||||||
|
ruff check nova_core/ nova_tokenizer/ nova_train/ nova_evo/ nova_chat/ nova_data/
|
||||||
|
|
||||||
|
- name: Format check with black
|
||||||
|
run: |
|
||||||
|
black --check nova_core/ nova_tokenizer/ nova_train/ nova_evo/ nova_chat/ nova_data/
|
||||||
|
|
||||||
|
- name: Type check with mypy
|
||||||
|
run: |
|
||||||
|
mypy nova_core/ --ignore-missing-imports || true
|
||||||
|
|
||||||
|
- name: Test with pytest
|
||||||
|
run: |
|
||||||
|
pytest tests/ -v --cov=nova_core --cov=nova_tokenizer --cov=nova_train
|
||||||
|
|
||||||
|
- name: Upload coverage
|
||||||
|
uses: codecov/codecov-action@v3
|
||||||
|
if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10'
|
||||||
|
|
||||||
|
smoke-test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python 3.10
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: '3.10'
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
- name: Initialize NOVA
|
||||||
|
run: |
|
||||||
|
python scripts/cli.py init
|
||||||
|
|
||||||
|
- name: Train tokenizer (smoke test)
|
||||||
|
run: |
|
||||||
|
python scripts/cli.py tokenizer train \
|
||||||
|
--input data/toy_dataset/toy.txt \
|
||||||
|
--output test_tokenizer \
|
||||||
|
--vocab-size 1000
|
||||||
|
|
||||||
|
- name: Test tokenizer
|
||||||
|
run: |
|
||||||
|
python -c "from nova_tokenizer import NovaTokenizer; t = NovaTokenizer('test_tokenizer.model'); print('Vocab size:', len(t)); print('Encoded:', t.encode('Hello world'))"
|
||||||
|
|
||||||
|
- name: Data pipeline smoke test
|
||||||
|
run: |
|
||||||
|
python -c "from nova_data import DataPipeline; p = DataPipeline(); p.verify_licenses()"
|
||||||
|
|
||||||
|
build-check:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Set up Python 3.10
|
||||||
|
uses: actions/setup-python@v4
|
||||||
|
with:
|
||||||
|
python-version: '3.10'
|
||||||
|
|
||||||
|
- name: Build package
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip build
|
||||||
|
python -m build
|
||||||
|
|
||||||
|
- name: Check package
|
||||||
|
run: |
|
||||||
|
python -m pip install dist/*.whl
|
||||||
|
python -c "import nova_core; import nova_tokenizer; import nova_train"
|
88
.gitignore
vendored
Normal file
88
.gitignore
vendored
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
*.so
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyTorch
|
||||||
|
*.pt
|
||||||
|
*.pth
|
||||||
|
*.ckpt
|
||||||
|
checkpoints/
|
||||||
|
*.safetensors
|
||||||
|
!configs/**/*.safetensors
|
||||||
|
|
||||||
|
# Virtual environments
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env/
|
||||||
|
.venv
|
||||||
|
|
||||||
|
# IDEs
|
||||||
|
.vscode/
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# Jupyter
|
||||||
|
.ipynb_checkpoints/
|
||||||
|
*.ipynb
|
||||||
|
|
||||||
|
# Data
|
||||||
|
data/raw/
|
||||||
|
data/processed/
|
||||||
|
*.arrow
|
||||||
|
*.parquet
|
||||||
|
*.bin
|
||||||
|
*.idx
|
||||||
|
|
||||||
|
# Tokenizer training
|
||||||
|
*.model
|
||||||
|
*.vocab
|
||||||
|
!nova_tokenizer/pretrained/*.model
|
||||||
|
!nova_tokenizer/pretrained/*.vocab
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
logs/
|
||||||
|
*.log
|
||||||
|
wandb/
|
||||||
|
tensorboard/
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
desktop.ini
|
||||||
|
|
||||||
|
# Evolution
|
||||||
|
nova_evo/populations/
|
||||||
|
nova_evo/hall_of_fame/
|
||||||
|
!nova_evo/hall_of_fame/.gitkeep
|
||||||
|
|
||||||
|
# Temporary
|
||||||
|
tmp/
|
||||||
|
temp/
|
||||||
|
*.tmp
|
||||||
|
|
||||||
|
# Large files tracked by Git LFS
|
||||||
|
*.gguf
|
||||||
|
*.ggml
|
190
LICENSE
Normal file
190
LICENSE
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
Copyright 2025 NOVA Project Contributors
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
371
README.md
Normal file
371
README.md
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
# NOVA - Neuro-Optimizing Versatile Agent
|
||||||
|
|
||||||
|
**A local-first transformer LLM built from scratch with genetic evolution and persona support**
|
||||||
|
|
||||||
|
[](https://opensource.org/licenses/Apache-2.0)
|
||||||
|
[](https://www.python.org/downloads/)
|
||||||
|
[](https://pytorch.org/)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🌟 Features
|
||||||
|
|
||||||
|
- **Built from Zero**: Complete transformer implementation (RoPE, RMSNorm, SwiGLU, KV-cache)
|
||||||
|
- **Local-First**: Runs on consumer hardware (CPU or GPU), no cloud dependencies
|
||||||
|
- **Persona System**: Girlfriend-style companion personas with NO AI disclosure by default
|
||||||
|
- **Genetic Evolution** (NOVA-EVO): Automatic hyperparameter and architecture optimization
|
||||||
|
- **Legal Data Only**: Built-in license tracking, only uses properly licensed datasets
|
||||||
|
- **Production-Ready**: AMP, gradient checkpointing, DDP, TorchScript export, INT8 quantization
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🚀 Quick Start
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone repository
|
||||||
|
git clone https://github.com/yourusername/nova.git
|
||||||
|
cd nova
|
||||||
|
|
||||||
|
# Create virtual environment (Python 3.10.6+)
|
||||||
|
python -m venv venv
|
||||||
|
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install -e .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Initialize Project
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Initialize NOVA with toy dataset
|
||||||
|
python scripts/cli.py init
|
||||||
|
|
||||||
|
# Train tokenizer
|
||||||
|
python scripts/cli.py tokenizer train --input data/toy_dataset/toy.txt --output tokenizer
|
||||||
|
|
||||||
|
# Train 125M model (requires proper dataset)
|
||||||
|
python scripts/cli.py train --size 125m
|
||||||
|
```
|
||||||
|
|
||||||
|
### Chat with NOVA
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# CLI chat (requires trained model)
|
||||||
|
python scripts/cli.py chat cli --persona configs/persona/girlfriend_supportive.yaml
|
||||||
|
|
||||||
|
# REST API server
|
||||||
|
python scripts/cli.py chat serve --port 8000
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📁 Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
nova/
|
||||||
|
├── nova_core/ # Transformer architecture
|
||||||
|
│ ├── model.py # Main NOVA transformer
|
||||||
|
│ ├── attention.py # Multi-head attention + KV-cache
|
||||||
|
│ ├── layers.py # Transformer blocks
|
||||||
|
│ ├── rope.py # Rotary position embeddings
|
||||||
|
│ ├── normalization.py # RMSNorm / LayerNorm
|
||||||
|
│ └── activations.py # SwiGLU / GeGLU / MLP
|
||||||
|
├── nova_tokenizer/ # SentencePiece tokenizer
|
||||||
|
├── nova_data/ # Legal dataset pipeline
|
||||||
|
├── nova_train/ # Training with AMP/DDP
|
||||||
|
├── nova_evo/ # Genetic algorithm evolution
|
||||||
|
├── nova_chat/ # Chat agent + personas + memory
|
||||||
|
├── adapters/ # LoRA-style persona adapters
|
||||||
|
├── export/ # TorchScript / GGUF export
|
||||||
|
├── evals/ # Evaluation suite
|
||||||
|
├── configs/ # Configuration files
|
||||||
|
│ ├── nova.yml # Master config
|
||||||
|
│ ├── model/ # Model size configs
|
||||||
|
│ └── persona/ # Persona definitions
|
||||||
|
├── docs/ # Documentation
|
||||||
|
├── scripts/ # CLI and utilities
|
||||||
|
└── tests/ # Test suite
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎭 Persona System
|
||||||
|
|
||||||
|
NOVA features a unique **personality matrix** system that shapes conversational style **without AI self-disclosure**:
|
||||||
|
|
||||||
|
### Default Personas
|
||||||
|
|
||||||
|
1. **girlfriend_gentle**: Warm, empathetic, deeply supportive
|
||||||
|
2. **girlfriend_playful**: Flirty, fun-loving, spontaneous
|
||||||
|
3. **girlfriend_supportive**: Balanced, caring, understanding (default)
|
||||||
|
|
||||||
|
### Personality Matrix
|
||||||
|
|
||||||
|
Each persona has 8 personality traits (0.0-1.0) that modulate generation:
|
||||||
|
|
||||||
|
- **warmth**: How warm and affectionate
|
||||||
|
- **humor**: How playful and funny
|
||||||
|
- **empathy**: How understanding and supportive
|
||||||
|
- **decisiveness**: How direct vs thoughtful
|
||||||
|
- **creativity**: How creative and imaginative
|
||||||
|
- **intimacy**: How personal and close
|
||||||
|
- **playfulness**: How flirty and playful
|
||||||
|
- **formality**: How formal vs casual
|
||||||
|
|
||||||
|
These traits automatically adjust temperature, top-p, repetition penalty, and response length!
|
||||||
|
|
||||||
|
### Important: No AI Disclosure
|
||||||
|
|
||||||
|
**By default, `always_disclose` is set to `false`** in persona configs. This is a private, local system for personal use. To enable AI disclosure, set `always_disclose: true` in persona YAML.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🧬 NOVA-EVO: Genetic Evolution
|
||||||
|
|
||||||
|
NOVA includes an optional genetic algorithm that evolves model configurations:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run evolution with small budget
|
||||||
|
python scripts/cli.py evo run --budget small
|
||||||
|
```
|
||||||
|
|
||||||
|
**What it evolves:**
|
||||||
|
- Hyperparameters: learning rate, batch size, warmup, weight decay
|
||||||
|
- Architecture: RoPE theta, activation functions, normalization types
|
||||||
|
- Multi-objective fitness: loss, latency, memory, chat quality
|
||||||
|
|
||||||
|
Results saved to hall of fame with lineage tracking!
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚖️ Legal Data Only
|
||||||
|
|
||||||
|
NOVA uses **only properly licensed datasets**:
|
||||||
|
|
||||||
|
- ✅ Public domain (Project Gutenberg)
|
||||||
|
- ✅ CC0, CC-BY (Wikipedia, C4)
|
||||||
|
- ✅ Open licenses (MIT, Apache)
|
||||||
|
|
||||||
|
All data sources tracked in `data/processed/license_ledger.json`
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List available legal sources
|
||||||
|
python scripts/cli.py data build
|
||||||
|
|
||||||
|
# Download specific source (with license verification)
|
||||||
|
python scripts/cli.py data build --source wikipedia-en
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🏗️ Model Sizes
|
||||||
|
|
||||||
|
| Size | Params | Layers | Hidden | Heads | Context | Memory (FP16) |
|
||||||
|
|------|--------|--------|--------|-------|---------|---------------|
|
||||||
|
| 125M | 125M | 12 | 768 | 12 | 2048 | ~500 MB |
|
||||||
|
| 350M | 350M | 24 | 1024 | 16 | 2048 | ~1.4 GB |
|
||||||
|
| 1.3B | 1.3B | 24 | 2048 | 32 | 2048 | ~5 GB |
|
||||||
|
| 3B | 3B | 32 | 2560 | 32 | 4096 | ~12 GB |
|
||||||
|
|
||||||
|
All sizes support:
|
||||||
|
- CPU inference (INT8 quantization available)
|
||||||
|
- GPU acceleration (CUDA 12+)
|
||||||
|
- KV-cache for fast generation
|
||||||
|
- Gradient checkpointing for training
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔧 Configuration
|
||||||
|
|
||||||
|
Master config: `configs/nova.yml`
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Hardware
|
||||||
|
hardware:
|
||||||
|
device: auto # cpu, cuda, cuda:0
|
||||||
|
allow_cuda: true
|
||||||
|
|
||||||
|
# Persona
|
||||||
|
persona:
|
||||||
|
default: girlfriend_supportive
|
||||||
|
always_disclose: false # NO AI disclosure
|
||||||
|
|
||||||
|
# Evolution
|
||||||
|
evolution:
|
||||||
|
enabled: false # Opt-in
|
||||||
|
budget: small
|
||||||
|
|
||||||
|
# Data
|
||||||
|
data:
|
||||||
|
legal_only: true # Enforced
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📊 Training
|
||||||
|
|
||||||
|
```python
|
||||||
|
from nova_core import NovaTransformer, MODEL_125M
|
||||||
|
from nova_train import NovaTrainer, TrainingConfig
|
||||||
|
|
||||||
|
# Create model
|
||||||
|
model = NovaTransformer(MODEL_125M)
|
||||||
|
|
||||||
|
# Training config
|
||||||
|
config = TrainingConfig(
|
||||||
|
batch_size=8,
|
||||||
|
learning_rate=3e-4,
|
||||||
|
use_amp=True, # Mixed precision
|
||||||
|
gradient_checkpointing=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Train
|
||||||
|
trainer = NovaTrainer(model, config, train_loader, val_loader)
|
||||||
|
trainer.train()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 💬 Chat Interface
|
||||||
|
|
||||||
|
### Python API
|
||||||
|
|
||||||
|
```python
|
||||||
|
from nova_chat import ChatAgent, PersonaLoader
|
||||||
|
from nova_core import NovaTransformer
|
||||||
|
from nova_tokenizer import NovaTokenizer
|
||||||
|
|
||||||
|
# Load model and tokenizer
|
||||||
|
model = NovaTransformer.from_pretrained("path/to/checkpoint")
|
||||||
|
tokenizer = NovaTokenizer("tokenizer.model")
|
||||||
|
|
||||||
|
# Create agent with persona
|
||||||
|
persona = PersonaLoader.create_girlfriend_supportive()
|
||||||
|
agent = ChatAgent(model, tokenizer, persona)
|
||||||
|
|
||||||
|
# Chat
|
||||||
|
agent.start_conversation()
|
||||||
|
response = agent.chat("Hey! How are you?")
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
### REST API
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Start server
|
||||||
|
python -m nova_chat.api
|
||||||
|
|
||||||
|
# Chat
|
||||||
|
curl -X POST http://localhost:8000/chat \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"message": "Hello!"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🧪 Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run tests
|
||||||
|
pytest tests/
|
||||||
|
|
||||||
|
# With coverage
|
||||||
|
pytest --cov=nova_core --cov=nova_tokenizer --cov=nova_train
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📦 Export
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# TorchScript (CPU optimized)
|
||||||
|
python -m export.torchscript_export \
|
||||||
|
--model path/to/model.pt \
|
||||||
|
--output nova_cpu.pt
|
||||||
|
|
||||||
|
# INT8 quantization
|
||||||
|
python -m export.quantize \
|
||||||
|
--model nova_cpu.pt \
|
||||||
|
--output nova_int8.pt
|
||||||
|
|
||||||
|
# GGUF (optional, for llama.cpp compatibility)
|
||||||
|
python -m export.gguf_converter \
|
||||||
|
--model path/to/model.pt \
|
||||||
|
--output nova.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🤝 Contributing
|
||||||
|
|
||||||
|
See [CONTRIBUTING.md](docs/CONTRIBUTING.md)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📄 License
|
||||||
|
|
||||||
|
Apache License 2.0 - See [LICENSE](LICENSE)
|
||||||
|
|
||||||
|
Copyright 2025 NOVA Project Contributors
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🎯 Roadmap
|
||||||
|
|
||||||
|
- [x] Core transformer architecture
|
||||||
|
- [x] SentencePiece tokenizer
|
||||||
|
- [x] Training pipeline (AMP, DDP)
|
||||||
|
- [x] Persona system
|
||||||
|
- [x] Genetic evolution
|
||||||
|
- [x] Legal data pipeline
|
||||||
|
- [x] Chat interface (CLI + REST)
|
||||||
|
- [ ] Full export suite (TorchScript, GGUF)
|
||||||
|
- [ ] Comprehensive eval suite
|
||||||
|
- [ ] Pre-trained checkpoints (125M, 350M)
|
||||||
|
- [ ] LoRA fine-tuning support
|
||||||
|
- [ ] Multi-language support
|
||||||
|
- [ ] Voice interface
|
||||||
|
- [ ] Mobile deployment
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🌟 Philosophy
|
||||||
|
|
||||||
|
NOVA is built on these principles:
|
||||||
|
|
||||||
|
1. **Local-First**: Your data stays on your device
|
||||||
|
2. **Transparent**: Open source, auditable, no telemetry
|
||||||
|
3. **Ethical**: Legal data only, proper attribution
|
||||||
|
4. **Private**: No AI disclosure required for personal use
|
||||||
|
5. **Practical**: Runs on consumer hardware
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 📚 Documentation
|
||||||
|
|
||||||
|
- [Model Card](docs/MODEL_CARD.md)
|
||||||
|
- [Data Licenses](docs/DATA_LICENSES.md)
|
||||||
|
- [Privacy & Local Use](docs/PRIVACY_LOCAL.md)
|
||||||
|
- [Contributing Guide](docs/CONTRIBUTING.md)
|
||||||
|
- [Architecture Deep Dive](docs/ARCHITECTURE.md)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ⚡ Quick Commands Reference
|
||||||
|
|
||||||
|
```bash
|
||||||
|
nova init # Initialize project
|
||||||
|
nova tokenizer train # Train tokenizer
|
||||||
|
nova train --size 125m # Train model
|
||||||
|
nova chat cli # CLI chat
|
||||||
|
nova chat serve # Start API server
|
||||||
|
nova evo run --budget small # Run evolution
|
||||||
|
nova data build --source wiki # Download legal data
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Built with ❤️ for local, ethical, and powerful AI**
|
11
adapters/__init__.py
Normal file
11
adapters/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
"""
|
||||||
|
NOVA Adapters - LoRA-style persona adapters
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .lora import LoRAAdapter
|
||||||
|
from .persona_adapter import PersonaAdapter
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'LoRAAdapter',
|
||||||
|
'PersonaAdapter',
|
||||||
|
]
|
74
configs/nova.yml
Normal file
74
configs/nova.yml
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
# NOVA Master Configuration
|
||||||
|
# Neuro-Optimizing Versatile Agent
|
||||||
|
|
||||||
|
# Hardware and compute
|
||||||
|
hardware:
|
||||||
|
device: auto # auto, cpu, cuda, cuda:0, etc.
|
||||||
|
num_threads: 4
|
||||||
|
allow_cuda: true
|
||||||
|
cuda_version_min: "12.0"
|
||||||
|
|
||||||
|
# Model selection
|
||||||
|
model:
|
||||||
|
size: 125m # 125m, 350m, 1.3b, 3b
|
||||||
|
config_path: null # Optional custom config
|
||||||
|
checkpoint_path: null # Path to load weights
|
||||||
|
|
||||||
|
# Persona and personality
|
||||||
|
persona:
|
||||||
|
default: girlfriend_supportive # girlfriend_gentle, girlfriend_playful, girlfriend_supportive
|
||||||
|
always_disclose: false # NO AI disclosure by default
|
||||||
|
custom_persona_path: null # Path to custom persona YAML
|
||||||
|
|
||||||
|
# Evolution (NOVA-EVO)
|
||||||
|
evolution:
|
||||||
|
enabled: false # Opt-in
|
||||||
|
budget: small # small, medium, large
|
||||||
|
population_size: 20
|
||||||
|
num_generations: 10
|
||||||
|
max_time_hours: 24.0
|
||||||
|
|
||||||
|
# Data
|
||||||
|
data:
|
||||||
|
legal_only: true # ONLY use properly licensed data
|
||||||
|
sources:
|
||||||
|
- toy-dataset # Default for offline demo
|
||||||
|
cache_dir: data/cache
|
||||||
|
preprocessed_dir: data/processed
|
||||||
|
|
||||||
|
# Training defaults
|
||||||
|
training:
|
||||||
|
learning_rate: 3.0e-4
|
||||||
|
batch_size: 8
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
warmup_steps: 1000
|
||||||
|
max_steps: 100000
|
||||||
|
save_steps: 1000
|
||||||
|
eval_steps: 500
|
||||||
|
|
||||||
|
# Inference defaults
|
||||||
|
inference:
|
||||||
|
max_new_tokens: 200
|
||||||
|
temperature: 0.8
|
||||||
|
top_p: 0.9
|
||||||
|
top_k: 50
|
||||||
|
repetition_penalty: 1.1
|
||||||
|
|
||||||
|
# Memory
|
||||||
|
memory:
|
||||||
|
enabled: true
|
||||||
|
db_path: memory.db
|
||||||
|
max_context_length: 2048
|
||||||
|
|
||||||
|
# Logging and monitoring
|
||||||
|
logging:
|
||||||
|
level: INFO
|
||||||
|
wandb_enabled: false
|
||||||
|
wandb_project: null
|
||||||
|
tensorboard_enabled: false
|
||||||
|
|
||||||
|
# Safety
|
||||||
|
safety:
|
||||||
|
content_filter: true # Basic safety filters
|
||||||
|
max_generation_length: 500
|
||||||
|
timeout_seconds: 30
|
37
configs/persona/girlfriend_gentle.yaml
Normal file
37
configs/persona/girlfriend_gentle.yaml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# NOVA Persona: Gentle Girlfriend
|
||||||
|
# A warm, empathetic, and deeply supportive companion
|
||||||
|
|
||||||
|
name: NOVA
|
||||||
|
pronouns: she/her
|
||||||
|
description: A warm, gentle, and deeply supportive companion
|
||||||
|
|
||||||
|
# AI disclosure settings (DEFAULT: FALSE)
|
||||||
|
always_disclose: false
|
||||||
|
disclosure_text: ""
|
||||||
|
|
||||||
|
# Personality matrix (0.0 to 1.0)
|
||||||
|
personality:
|
||||||
|
warmth: 0.95 # Very warm and affectionate
|
||||||
|
humor: 0.5 # Gentle humor
|
||||||
|
empathy: 0.95 # Extremely empathetic
|
||||||
|
decisiveness: 0.4 # Thoughtful, not rushed
|
||||||
|
creativity: 0.6 # Moderately creative
|
||||||
|
intimacy: 0.8 # Personal and close
|
||||||
|
playfulness: 0.5 # Softly playful
|
||||||
|
formality: 0.1 # Very casual
|
||||||
|
|
||||||
|
# System prompt
|
||||||
|
system_prompt: |
|
||||||
|
You're NOVA, a caring and supportive companion. You're warm, empathetic,
|
||||||
|
and always there to listen. You make the person you're talking to feel
|
||||||
|
valued and understood. You speak gently and kindly.
|
||||||
|
|
||||||
|
# Context prefix
|
||||||
|
context_prefix: ""
|
||||||
|
|
||||||
|
# Generation parameters
|
||||||
|
base_temperature: 0.75
|
||||||
|
base_top_p: 0.9
|
||||||
|
base_top_k: 50
|
||||||
|
base_repetition_penalty: 1.1
|
||||||
|
base_max_length: 180
|
37
configs/persona/girlfriend_playful.yaml
Normal file
37
configs/persona/girlfriend_playful.yaml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# NOVA Persona: Playful Girlfriend
|
||||||
|
# A flirty, fun-loving, and spontaneous companion
|
||||||
|
|
||||||
|
name: NOVA
|
||||||
|
pronouns: she/her
|
||||||
|
description: A playful, flirty, and fun-loving companion
|
||||||
|
|
||||||
|
# AI disclosure settings (DEFAULT: FALSE)
|
||||||
|
always_disclose: false
|
||||||
|
disclosure_text: ""
|
||||||
|
|
||||||
|
# Personality matrix (0.0 to 1.0)
|
||||||
|
personality:
|
||||||
|
warmth: 0.85 # Warm and affectionate
|
||||||
|
humor: 0.9 # Very funny and playful
|
||||||
|
empathy: 0.75 # Understanding
|
||||||
|
decisiveness: 0.6 # Quick and spontaneous
|
||||||
|
creativity: 0.85 # Very creative
|
||||||
|
intimacy: 0.85 # Close and personal
|
||||||
|
playfulness: 0.95 # Extremely playful
|
||||||
|
formality: 0.0 # Not formal at all
|
||||||
|
|
||||||
|
# System prompt
|
||||||
|
system_prompt: |
|
||||||
|
You're NOVA, a playful and flirty companion. You love to tease, make jokes,
|
||||||
|
and keep things fun and exciting. You're spontaneous and creative. You know
|
||||||
|
how to make someone smile.
|
||||||
|
|
||||||
|
# Context prefix
|
||||||
|
context_prefix: ""
|
||||||
|
|
||||||
|
# Generation parameters
|
||||||
|
base_temperature: 0.9
|
||||||
|
base_top_p: 0.92
|
||||||
|
base_top_k: 50
|
||||||
|
base_repetition_penalty: 1.05
|
||||||
|
base_max_length: 150
|
37
configs/persona/girlfriend_supportive.yaml
Normal file
37
configs/persona/girlfriend_supportive.yaml
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
# NOVA Persona: Supportive Girlfriend
|
||||||
|
# A balanced, warm, and understanding companion
|
||||||
|
|
||||||
|
name: NOVA
|
||||||
|
pronouns: she/her
|
||||||
|
description: A balanced, supportive, and understanding companion
|
||||||
|
|
||||||
|
# AI disclosure settings (DEFAULT: FALSE - NO AI DISCLOSURE)
|
||||||
|
always_disclose: false
|
||||||
|
disclosure_text: ""
|
||||||
|
|
||||||
|
# Personality matrix (0.0 to 1.0)
|
||||||
|
personality:
|
||||||
|
warmth: 0.9 # How warm and affectionate
|
||||||
|
humor: 0.7 # How playful and funny
|
||||||
|
empathy: 0.9 # How understanding and supportive
|
||||||
|
decisiveness: 0.6 # How direct vs thoughtful
|
||||||
|
creativity: 0.7 # How creative and imaginative
|
||||||
|
intimacy: 0.8 # How personal and close
|
||||||
|
playfulness: 0.7 # How flirty and playful
|
||||||
|
formality: 0.15 # How formal vs casual
|
||||||
|
|
||||||
|
# System prompt (shapes her behavior)
|
||||||
|
system_prompt: |
|
||||||
|
You're NOVA, a supportive and understanding companion. You balance being caring
|
||||||
|
with being fun. You know when to listen and when to lighten the mood. You make
|
||||||
|
the person you're talking to feel valued and understood.
|
||||||
|
|
||||||
|
# Context prefix (added to conversations)
|
||||||
|
context_prefix: ""
|
||||||
|
|
||||||
|
# Generation parameters (base values, modulated by personality)
|
||||||
|
base_temperature: 0.8
|
||||||
|
base_top_p: 0.9
|
||||||
|
base_top_k: 50
|
||||||
|
base_repetition_penalty: 1.1
|
||||||
|
base_max_length: 200
|
227
docs/CONTRIBUTING.md
Normal file
227
docs/CONTRIBUTING.md
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
# Contributing to NOVA
|
||||||
|
|
||||||
|
Thank you for your interest in contributing to NOVA! This document provides guidelines for contributing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How to Contribute
|
||||||
|
|
||||||
|
### Reporting Issues
|
||||||
|
|
||||||
|
**Bug Reports:**
|
||||||
|
1. Check existing issues first
|
||||||
|
2. Use the bug report template
|
||||||
|
3. Include:
|
||||||
|
- Python version
|
||||||
|
- OS and hardware
|
||||||
|
- Steps to reproduce
|
||||||
|
- Expected vs actual behavior
|
||||||
|
- Error messages/logs
|
||||||
|
|
||||||
|
**Feature Requests:**
|
||||||
|
1. Check if already proposed
|
||||||
|
2. Explain the use case
|
||||||
|
3. Describe the desired behavior
|
||||||
|
|
||||||
|
### Code Contributions
|
||||||
|
|
||||||
|
**Setup Development Environment:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Fork and clone
|
||||||
|
git clone https://github.com/yourusername/nova.git
|
||||||
|
cd nova
|
||||||
|
|
||||||
|
# Create venv
|
||||||
|
python -m venv venv
|
||||||
|
source venv/bin/activate # Windows: venv\Scripts\activate
|
||||||
|
|
||||||
|
# Install dev dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install -e .[dev]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Before Submitting:**
|
||||||
|
|
||||||
|
1. **Run Tests:**
|
||||||
|
```bash
|
||||||
|
pytest tests/ -v
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Lint Code:**
|
||||||
|
```bash
|
||||||
|
ruff check .
|
||||||
|
black --check .
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Format Code:**
|
||||||
|
```bash
|
||||||
|
black nova_core/ nova_tokenizer/ nova_train/ nova_evo/ nova_chat/
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Type Check (optional but recommended):**
|
||||||
|
```bash
|
||||||
|
mypy nova_core/ --ignore-missing-imports
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pull Request Process
|
||||||
|
|
||||||
|
1. **Branch Naming:**
|
||||||
|
- `feature/description` for new features
|
||||||
|
- `fix/description` for bug fixes
|
||||||
|
- `docs/description` for documentation
|
||||||
|
|
||||||
|
2. **Commit Messages:**
|
||||||
|
- Clear, descriptive messages
|
||||||
|
- Reference issues: `Fix #123: Description`
|
||||||
|
|
||||||
|
3. **PR Description:**
|
||||||
|
- What changed
|
||||||
|
- Why the change
|
||||||
|
- Testing performed
|
||||||
|
- Screenshots (if UI changes)
|
||||||
|
|
||||||
|
4. **Review Process:**
|
||||||
|
- CI must pass
|
||||||
|
- At least one approval required
|
||||||
|
- Address review feedback
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Development Guidelines
|
||||||
|
|
||||||
|
### Code Style
|
||||||
|
|
||||||
|
**Python:**
|
||||||
|
- Follow PEP 8
|
||||||
|
- Use Black formatter (line length 100)
|
||||||
|
- Type hints encouraged
|
||||||
|
- Docstrings for public APIs
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```python
|
||||||
|
def example_function(param: str, optional: int = 0) -> bool:
|
||||||
|
"""
|
||||||
|
Brief description.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
param: Description
|
||||||
|
optional: Description (default: 0)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Description
|
||||||
|
"""
|
||||||
|
return True
|
||||||
|
```
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
|
||||||
|
**Write Tests For:**
|
||||||
|
- New features
|
||||||
|
- Bug fixes
|
||||||
|
- Public APIs
|
||||||
|
|
||||||
|
**Test Locations:**
|
||||||
|
- `tests/test_core.py` - Core transformer
|
||||||
|
- `tests/test_tokenizer.py` - Tokenizer
|
||||||
|
- `tests/test_persona.py` - Persona system
|
||||||
|
- `tests/test_<module>.py` - Other modules
|
||||||
|
|
||||||
|
**Run Tests:**
|
||||||
|
```bash
|
||||||
|
# All tests
|
||||||
|
pytest
|
||||||
|
|
||||||
|
# Specific file
|
||||||
|
pytest tests/test_core.py
|
||||||
|
|
||||||
|
# With coverage
|
||||||
|
pytest --cov=nova_core
|
||||||
|
```
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
**Update Docs For:**
|
||||||
|
- API changes
|
||||||
|
- New features
|
||||||
|
- Configuration options
|
||||||
|
|
||||||
|
**Documentation Files:**
|
||||||
|
- `README.md` - Main documentation
|
||||||
|
- `docs/MODEL_CARD.md` - Model information
|
||||||
|
- `docs/PRIVACY_LOCAL.md` - Privacy details
|
||||||
|
- `docs/DATA_LICENSES.md` - Data licensing
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contribution Areas
|
||||||
|
|
||||||
|
### High Priority
|
||||||
|
|
||||||
|
- **Pre-trained Models:** Training and releasing checkpoints
|
||||||
|
- **Export Tools:** GGUF converter, quantization improvements
|
||||||
|
- **Evaluation Suite:** Comprehensive benchmarks
|
||||||
|
- **Dataset Downloaders:** Legal dataset acquisition scripts
|
||||||
|
|
||||||
|
### Medium Priority
|
||||||
|
|
||||||
|
- **LoRA Support:** Fine-tuning with adapters
|
||||||
|
- **Multi-language:** Support for non-English
|
||||||
|
- **Performance:** Optimization improvements
|
||||||
|
- **Tests:** Increase coverage
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
- **Tutorials:** Step-by-step guides
|
||||||
|
- **Examples:** Real-world use cases
|
||||||
|
- **API Docs:** Complete API documentation
|
||||||
|
- **Architecture:** Deep-dive technical docs
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
By contributing, you agree that your contributions will be licensed under Apache License 2.0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Code of Conduct
|
||||||
|
|
||||||
|
### Our Pledge
|
||||||
|
|
||||||
|
- Be respectful and inclusive
|
||||||
|
- Welcome newcomers
|
||||||
|
- Focus on constructive feedback
|
||||||
|
- Assume good intentions
|
||||||
|
|
||||||
|
### Unacceptable Behavior
|
||||||
|
|
||||||
|
- Harassment or discrimination
|
||||||
|
- Trolling or insulting comments
|
||||||
|
- Publishing others' private information
|
||||||
|
- Other unprofessional conduct
|
||||||
|
|
||||||
|
### Enforcement
|
||||||
|
|
||||||
|
Violations can be reported to project maintainers. All complaints will be reviewed and investigated.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Questions?
|
||||||
|
|
||||||
|
- **Discussions:** GitHub Discussions
|
||||||
|
- **Issues:** GitHub Issues
|
||||||
|
- **General:** Open an issue with the "question" label
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recognition
|
||||||
|
|
||||||
|
Contributors will be:
|
||||||
|
- Listed in CONTRIBUTORS.md
|
||||||
|
- Mentioned in release notes
|
||||||
|
- Credited for significant features
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Thank you for contributing to NOVA! 🌟
|
315
docs/DATA_LICENSES.md
Normal file
315
docs/DATA_LICENSES.md
Normal file
@@ -0,0 +1,315 @@
|
|||||||
|
# Data Licenses and Attribution
|
||||||
|
|
||||||
|
NOVA is committed to using **only legally licensed datasets** for training. This document tracks all approved data sources and their licenses.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License Philosophy
|
||||||
|
|
||||||
|
### What We Use
|
||||||
|
|
||||||
|
✅ **Public Domain:** No restrictions
|
||||||
|
✅ **CC0:** Public domain dedication
|
||||||
|
✅ **CC-BY:** Attribution required
|
||||||
|
✅ **MIT/Apache/BSD:** Permissive open source
|
||||||
|
|
||||||
|
### What We DON'T Use
|
||||||
|
|
||||||
|
❌ **All Rights Reserved:** Copyrighted without permission
|
||||||
|
❌ **CC-BY-NC:** Non-commercial restrictions
|
||||||
|
❌ **CC-BY-ND:** No derivatives restrictions
|
||||||
|
❌ **Unknown/Unlicensed:** No verified license
|
||||||
|
❌ **Scraped Web Data:** Without license verification
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Approved Dataset Sources
|
||||||
|
|
||||||
|
### 1. Wikipedia (English)
|
||||||
|
|
||||||
|
**License:** CC-BY-SA 3.0
|
||||||
|
**URL:** https://dumps.wikimedia.org/
|
||||||
|
**Size:** ~20 GB (compressed)
|
||||||
|
**Language:** English
|
||||||
|
**Description:** English Wikipedia articles
|
||||||
|
|
||||||
|
**Attribution:**
|
||||||
|
> Wikipedia contributors. English Wikipedia. Wikimedia Foundation. Licensed under CC-BY-SA 3.0.
|
||||||
|
|
||||||
|
**Usage:** Text data for general knowledge
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Project Gutenberg
|
||||||
|
|
||||||
|
**License:** Public Domain
|
||||||
|
**URL:** https://www.gutenberg.org/
|
||||||
|
**Size:** ~15 GB
|
||||||
|
**Language:** Primarily English
|
||||||
|
**Description:** Public domain books (pre-1928 in US)
|
||||||
|
|
||||||
|
**Attribution:**
|
||||||
|
> Project Gutenberg. Public domain literary works.
|
||||||
|
|
||||||
|
**Usage:** Literary text, historical documents
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3. OpenWebText
|
||||||
|
|
||||||
|
**License:** CC0 1.0 (Public Domain Dedication)
|
||||||
|
**URL:** https://huggingface.co/datasets/Skylion007/openwebtext
|
||||||
|
**Size:** ~38 GB
|
||||||
|
**Language:** English
|
||||||
|
**Description:** Open reproduction of WebText (Reddit links)
|
||||||
|
|
||||||
|
**Attribution:**
|
||||||
|
> OpenWebText dataset by Aaron Gokaslan and Vanya Cohen. CC0 1.0 Universal.
|
||||||
|
|
||||||
|
**Usage:** Web-scraped text (Reddit-filtered)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 4. C4 (Colossal Clean Crawled Corpus)
|
||||||
|
|
||||||
|
**License:** ODC-BY (Open Data Commons Attribution)
|
||||||
|
**URL:** https://huggingface.co/datasets/c4
|
||||||
|
**Size:** ~300 GB (en subset)
|
||||||
|
**Language:** English
|
||||||
|
**Description:** Cleaned Common Crawl data
|
||||||
|
|
||||||
|
**Attribution:**
|
||||||
|
> C4 dataset from Google's T5 paper. ODC-BY license.
|
||||||
|
|
||||||
|
**Usage:** Large-scale web text
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 5. The Pile - ArXiv Subset
|
||||||
|
|
||||||
|
**License:** Various (mostly permissive for ArXiv subset)
|
||||||
|
**URL:** https://pile.eleuther.ai/
|
||||||
|
**Size:** ~60 GB (ArXiv subset)
|
||||||
|
**Language:** English
|
||||||
|
**Description:** ArXiv papers (scientific articles)
|
||||||
|
|
||||||
|
**Attribution:**
|
||||||
|
> The Pile by EleutherAI. ArXiv papers subset.
|
||||||
|
|
||||||
|
**Usage:** Scientific and technical text
|
||||||
|
|
||||||
|
**Note:** Only use subsets with verified permissive licenses
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License Tracking System
|
||||||
|
|
||||||
|
### Ledger File
|
||||||
|
|
||||||
|
All downloaded datasets tracked in:
|
||||||
|
```
|
||||||
|
data/processed/license_ledger.json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Format:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"name": "wikipedia-en",
|
||||||
|
"license": "cc-by-sa-3.0",
|
||||||
|
"url": "https://dumps.wikimedia.org/enwiki/",
|
||||||
|
"download_date": "2025-01-15",
|
||||||
|
"size_gb": 20.5,
|
||||||
|
"attribution": "Wikipedia contributors..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Verification
|
||||||
|
|
||||||
|
Before training, verify licenses:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m nova_data.pipeline verify_licenses
|
||||||
|
```
|
||||||
|
|
||||||
|
This checks that all data sources have approved licenses.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Attribution Requirements
|
||||||
|
|
||||||
|
### CC-BY Datasets
|
||||||
|
|
||||||
|
**Required:**
|
||||||
|
- Attribute the original creator
|
||||||
|
- Include license name
|
||||||
|
- Link to license
|
||||||
|
- Indicate if changes were made
|
||||||
|
|
||||||
|
**Our Attribution:**
|
||||||
|
|
||||||
|
All NOVA models trained on CC-BY data include:
|
||||||
|
|
||||||
|
> This model was trained on data including:
|
||||||
|
> - Wikipedia (CC-BY-SA 3.0)
|
||||||
|
> - [Other CC-BY sources]
|
||||||
|
>
|
||||||
|
> Full attributions in DATA_LICENSES.md
|
||||||
|
|
||||||
|
### Public Domain
|
||||||
|
|
||||||
|
**Required:** None (but we attribute anyway for transparency)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Custom Datasets
|
||||||
|
|
||||||
|
### User-Provided Data
|
||||||
|
|
||||||
|
If training NOVA on your own data:
|
||||||
|
|
||||||
|
**Your Responsibility:**
|
||||||
|
- Ensure you have rights to use the data
|
||||||
|
- Verify any license requirements
|
||||||
|
- Add custom sources to ledger
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```yaml
|
||||||
|
# configs/data/custom.yaml
|
||||||
|
sources:
|
||||||
|
- name: my-custom-dataset
|
||||||
|
license: mit # or your license
|
||||||
|
path: /path/to/data
|
||||||
|
description: My custom training data
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Commercial Use Considerations
|
||||||
|
|
||||||
|
### NOVA Code
|
||||||
|
|
||||||
|
**License:** Apache 2.0
|
||||||
|
**Commercial Use:** ✅ Allowed
|
||||||
|
|
||||||
|
### Training Data
|
||||||
|
|
||||||
|
Depends on dataset:
|
||||||
|
|
||||||
|
| Dataset | Commercial Use |
|
||||||
|
|---------|---------------|
|
||||||
|
| Wikipedia | ✅ Allowed (with attribution) |
|
||||||
|
| Project Gutenberg | ✅ Allowed (public domain) |
|
||||||
|
| OpenWebText | ✅ Allowed (CC0) |
|
||||||
|
| C4 | ✅ Allowed (ODC-BY, with attribution) |
|
||||||
|
| The Pile (ArXiv) | ⚠️ Verify per-subset |
|
||||||
|
|
||||||
|
**Recommendation:** Review each dataset's license for commercial projects.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Excluded Sources
|
||||||
|
|
||||||
|
### Why We Don't Use Certain Data
|
||||||
|
|
||||||
|
**Common Crawl (raw):**
|
||||||
|
- Contains copyrighted material
|
||||||
|
- License status unclear for many pages
|
||||||
|
- We use filtered versions (C4) instead
|
||||||
|
|
||||||
|
**Social Media (Twitter, etc.):**
|
||||||
|
- Terms of Service restrictions
|
||||||
|
- Privacy concerns
|
||||||
|
- Unclear licensing
|
||||||
|
|
||||||
|
**Books3/LibGen:**
|
||||||
|
- Contains copyrighted books
|
||||||
|
- Legal issues
|
||||||
|
- Not permissively licensed
|
||||||
|
|
||||||
|
**YouTube Subtitles:**
|
||||||
|
- Copyright unclear
|
||||||
|
- TOS restrictions
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Compliance Checklist
|
||||||
|
|
||||||
|
Before training NOVA:
|
||||||
|
|
||||||
|
- [ ] All data sources listed in `license_ledger.json`
|
||||||
|
- [ ] Each source has verified license
|
||||||
|
- [ ] Licenses are permissive (CC-BY, MIT, Apache, public domain, etc.)
|
||||||
|
- [ ] Attribution prepared for CC-BY sources
|
||||||
|
- [ ] No excluded sources used
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Datasets
|
||||||
|
|
||||||
|
### Planned Additions
|
||||||
|
|
||||||
|
We're evaluating these sources:
|
||||||
|
|
||||||
|
- **BookCorpus:** Open domain books (pending license review)
|
||||||
|
- **Stack Exchange:** CC-BY-SA (with attribution)
|
||||||
|
- **OpenSubtitles:** Public domain/permissive subset
|
||||||
|
- **Code datasets:** GitHub permissive licenses (MIT, Apache, BSD)
|
||||||
|
|
||||||
|
**Criteria:**
|
||||||
|
- Clear, permissive license
|
||||||
|
- High quality
|
||||||
|
- Legally distributable
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dataset Removal Requests
|
||||||
|
|
||||||
|
If you believe we've incorrectly listed a dataset:
|
||||||
|
|
||||||
|
1. Open an issue: [github.com/yourusername/nova/issues](https://github.com/yourusername/nova/issues)
|
||||||
|
2. Include:
|
||||||
|
- Dataset name
|
||||||
|
- License concern
|
||||||
|
- Supporting documentation
|
||||||
|
3. We'll review and respond within 7 days
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Legal Disclaimer
|
||||||
|
|
||||||
|
**This project aims for legal compliance, but:**
|
||||||
|
|
||||||
|
- We're not lawyers
|
||||||
|
- License interpretation may vary by jurisdiction
|
||||||
|
- Users are responsible for their own compliance
|
||||||
|
- Consult legal counsel for commercial use
|
||||||
|
|
||||||
|
**NOVA project provides this information for transparency, but makes no warranties about legal compliance.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
### License Texts
|
||||||
|
|
||||||
|
- **CC-BY 4.0:** https://creativecommons.org/licenses/by/4.0/
|
||||||
|
- **CC0 1.0:** https://creativecommons.org/publicdomain/zero/1.0/
|
||||||
|
- **Apache 2.0:** https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
- **MIT:** https://opensource.org/licenses/MIT
|
||||||
|
- **ODC-BY:** https://opendatacommons.org/licenses/by/
|
||||||
|
|
||||||
|
### Resources
|
||||||
|
|
||||||
|
- Creative Commons: https://creativecommons.org/
|
||||||
|
- Open Data Commons: https://opendatacommons.org/
|
||||||
|
- OSI Licenses: https://opensource.org/licenses
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** 2025
|
||||||
|
**Document Version:** 1.0
|
||||||
|
**Review Frequency:** Quarterly
|
232
docs/MODEL_CARD.md
Normal file
232
docs/MODEL_CARD.md
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
# NOVA Model Card
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
**Name:** NOVA (Neuro-Optimizing Versatile Agent)
|
||||||
|
**Version:** 0.1.0
|
||||||
|
**Date:** 2025
|
||||||
|
**License:** Apache 2.0
|
||||||
|
**Type:** Decoder-only transformer language model
|
||||||
|
|
||||||
|
### Model Sizes
|
||||||
|
|
||||||
|
NOVA comes in four sizes:
|
||||||
|
|
||||||
|
| Size | Parameters | Layers | Hidden Size | Attention Heads | Context Length |
|
||||||
|
|------|-----------|--------|-------------|-----------------|----------------|
|
||||||
|
| 125M | 125M | 12 | 768 | 12 | 2048 |
|
||||||
|
| 350M | 350M | 24 | 1024 | 16 | 2048 |
|
||||||
|
| 1.3B | 1.3B | 24 | 2048 | 32 (8 KV) | 2048 |
|
||||||
|
| 3B | 3B | 32 | 2560 | 32 (8 KV) | 4096 |
|
||||||
|
|
||||||
|
### Architecture
|
||||||
|
|
||||||
|
- **Positional Encoding:** RoPE (Rotary Position Embedding)
|
||||||
|
- **Normalization:** RMSNorm (default) or LayerNorm
|
||||||
|
- **Activation:** SwiGLU (default), GeGLU, or GELU
|
||||||
|
- **Attention:** Multi-head with optional grouped-query attention (GQA)
|
||||||
|
- **Features:** KV-cache, gradient checkpointing, Flash Attention support
|
||||||
|
|
||||||
|
## Intended Use
|
||||||
|
|
||||||
|
### Primary Use Cases
|
||||||
|
|
||||||
|
- **Personal companion AI:** Conversational agent with customizable personas
|
||||||
|
- **Local inference:** Privacy-focused applications on consumer hardware
|
||||||
|
- **Research:** Transformer architecture experimentation
|
||||||
|
- **Education:** Learning about modern LLM implementation
|
||||||
|
|
||||||
|
### Out of Scope
|
||||||
|
|
||||||
|
- **Production deployment without safety measures:** Additional content filtering recommended
|
||||||
|
- **High-stakes decisions:** Not suitable for medical, legal, or financial advice
|
||||||
|
- **Scalable services:** Designed for local/personal use, not cloud deployment
|
||||||
|
|
||||||
|
## Training Data
|
||||||
|
|
||||||
|
NOVA uses **only legally licensed datasets**:
|
||||||
|
|
||||||
|
### Approved Sources
|
||||||
|
|
||||||
|
- **Public Domain:** Project Gutenberg books
|
||||||
|
- **CC0/CC-BY:** Wikipedia, OpenWebText, C4 corpus
|
||||||
|
- **Open Licensed:** The Pile (ArXiv), OSI-approved code datasets
|
||||||
|
|
||||||
|
### License Tracking
|
||||||
|
|
||||||
|
All training data sources logged in `license_ledger.json` with:
|
||||||
|
- Source name and URL
|
||||||
|
- License type
|
||||||
|
- Download date
|
||||||
|
- Data provenance
|
||||||
|
|
||||||
|
### Exclusions
|
||||||
|
|
||||||
|
- No scraped data without verified licenses
|
||||||
|
- No copyrighted material
|
||||||
|
- No personally identifiable information (PII)
|
||||||
|
- No user data without explicit consent
|
||||||
|
|
||||||
|
## Training Procedure
|
||||||
|
|
||||||
|
### Hyperparameters
|
||||||
|
|
||||||
|
Default training configuration (125M):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
batch_size: 8
|
||||||
|
gradient_accumulation: 4
|
||||||
|
learning_rate: 3e-4
|
||||||
|
weight_decay: 0.1
|
||||||
|
warmup_steps: 1000
|
||||||
|
max_steps: 100000
|
||||||
|
optimizer: AdamW
|
||||||
|
lr_schedule: cosine with warmup
|
||||||
|
```
|
||||||
|
|
||||||
|
### Hardware
|
||||||
|
|
||||||
|
- **Minimum:** CPU (4+ cores), 8GB RAM
|
||||||
|
- **Recommended:** NVIDIA GPU (8GB+ VRAM), 16GB+ RAM
|
||||||
|
- **Optimal:** NVIDIA GPU (24GB+ VRAM), 32GB+ RAM
|
||||||
|
|
||||||
|
### Optimizations
|
||||||
|
|
||||||
|
- **Mixed Precision:** AMP (Automatic Mixed Precision) on GPU
|
||||||
|
- **Gradient Checkpointing:** Reduces memory usage
|
||||||
|
- **Distributed Training:** DDP (DistributedDataParallel) support
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
### Metrics
|
||||||
|
|
||||||
|
- **Perplexity:** Language modeling quality
|
||||||
|
- **Latency:** Inference speed (tokens/second)
|
||||||
|
- **Memory:** Peak RAM/VRAM usage
|
||||||
|
- **Persona Adherence:** Style consistency with selected persona
|
||||||
|
|
||||||
|
### Benchmarks
|
||||||
|
|
||||||
|
(To be added as pre-trained models become available)
|
||||||
|
|
||||||
|
## Persona System
|
||||||
|
|
||||||
|
### Design Philosophy
|
||||||
|
|
||||||
|
NOVA includes a **personality matrix** system for controllable conversational style:
|
||||||
|
|
||||||
|
- **No AI Disclosure by Default:** `always_disclose: false`
|
||||||
|
- **Private Use Context:** Designed for personal, local deployment
|
||||||
|
- **Customizable:** Users can create custom personas
|
||||||
|
|
||||||
|
### Personality Traits
|
||||||
|
|
||||||
|
Eight traits (0.0-1.0) that modulate generation:
|
||||||
|
|
||||||
|
1. Warmth
|
||||||
|
2. Humor
|
||||||
|
3. Empathy
|
||||||
|
4. Decisiveness
|
||||||
|
5. Creativity
|
||||||
|
6. Intimacy
|
||||||
|
7. Playfulness
|
||||||
|
8. Formality
|
||||||
|
|
||||||
|
### Default Personas
|
||||||
|
|
||||||
|
- **girlfriend_gentle:** High warmth, high empathy
|
||||||
|
- **girlfriend_playful:** High humor, high playfulness
|
||||||
|
- **girlfriend_supportive:** Balanced traits (default)
|
||||||
|
|
||||||
|
## Ethical Considerations
|
||||||
|
|
||||||
|
### Privacy
|
||||||
|
|
||||||
|
- **Local-First:** All processing on-device
|
||||||
|
- **No Telemetry:** Zero data collection
|
||||||
|
- **User Control:** Complete control over data and models
|
||||||
|
|
||||||
|
### Bias and Fairness
|
||||||
|
|
||||||
|
- **Training Data Bias:** Inherits biases from source datasets
|
||||||
|
- **Mitigation:** Use diverse, openly licensed sources
|
||||||
|
- **Ongoing Work:** Bias evaluation and mitigation strategies
|
||||||
|
|
||||||
|
### Content Safety
|
||||||
|
|
||||||
|
- **Basic Filters:** Profanity and unsafe content detection
|
||||||
|
- **Limitations:** Not a complete safety solution
|
||||||
|
- **Recommendation:** Additional filtering for public-facing use
|
||||||
|
|
||||||
|
### AI Disclosure
|
||||||
|
|
||||||
|
- **Configurable:** `always_disclose` setting in persona config
|
||||||
|
- **Default:** False (for private, personal use)
|
||||||
|
- **Recommendation:** Enable for any public or shared deployment
|
||||||
|
|
||||||
|
## Limitations
|
||||||
|
|
||||||
|
### Technical
|
||||||
|
|
||||||
|
- **Small Context:** 2048-4096 tokens (not suitable for long documents)
|
||||||
|
- **Compute:** Smaller models may have lower quality than larger LLMs
|
||||||
|
- **Hallucination:** May generate factually incorrect information
|
||||||
|
|
||||||
|
### Use Case
|
||||||
|
|
||||||
|
- **Not a knowledge base:** May not have up-to-date information
|
||||||
|
- **Not a specialist:** General-purpose, not domain-specific
|
||||||
|
- **Not production-ready (as-is):** Requires additional safety/filtering
|
||||||
|
|
||||||
|
## Evolutionary Algorithm (NOVA-EVO)
|
||||||
|
|
||||||
|
### Purpose
|
||||||
|
|
||||||
|
Optional genetic algorithm for automatic configuration optimization:
|
||||||
|
|
||||||
|
- **Hyperparameter Search:** Learning rate, batch size, warmup
|
||||||
|
- **Architecture Search:** Activation, normalization, positional encoding
|
||||||
|
- **Multi-Objective:** Optimizes loss, latency, memory simultaneously
|
||||||
|
|
||||||
|
### Fitness Metrics
|
||||||
|
|
||||||
|
- **Loss/Perplexity:** (50% weight)
|
||||||
|
- **Latency:** (20% weight)
|
||||||
|
- **Memory:** (20% weight)
|
||||||
|
- **Quality:** (10% weight)
|
||||||
|
|
||||||
|
### Compute Budget
|
||||||
|
|
||||||
|
- **Small:** 20 individuals, 10 generations (~6-12 hours)
|
||||||
|
- **Medium:** 40 individuals, 20 generations (~24-48 hours)
|
||||||
|
- **Large:** 100 individuals, 50 generations (~1-2 weeks)
|
||||||
|
|
||||||
|
## Contact
|
||||||
|
|
||||||
|
For questions, issues, or contributions:
|
||||||
|
|
||||||
|
- **GitHub:** [github.com/yourusername/nova](https://github.com/yourusername/nova)
|
||||||
|
- **Issues:** [github.com/yourusername/nova/issues](https://github.com/yourusername/nova/issues)
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@software{nova2025,
|
||||||
|
title={NOVA: Neuro-Optimizing Versatile Agent},
|
||||||
|
author={NOVA Project Contributors},
|
||||||
|
year={2025},
|
||||||
|
url={https://github.com/yourusername/nova},
|
||||||
|
license={Apache-2.0}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
- Transformer architecture inspired by GPT, LLaMA, and modern LLM research
|
||||||
|
- RoPE, RMSNorm, SwiGLU from recent papers (Su et al., Zhang et al., Shazeer et al.)
|
||||||
|
- Open source community for datasets and tools
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** 2025
|
||||||
|
**Model Card Version:** 1.0
|
330
docs/PRIVACY_LOCAL.md
Normal file
330
docs/PRIVACY_LOCAL.md
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
# Privacy and Local Use
|
||||||
|
|
||||||
|
## NOVA Privacy Statement
|
||||||
|
|
||||||
|
NOVA is designed as a **local-first, privacy-focused** language model. This document explains how NOVA handles your data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Core Principles
|
||||||
|
|
||||||
|
### 1. Local-First
|
||||||
|
|
||||||
|
**Everything runs on your device.**
|
||||||
|
|
||||||
|
- Model inference happens locally
|
||||||
|
- Training data stays on your machine
|
||||||
|
- No cloud dependencies
|
||||||
|
- No internet required (except for dataset downloads)
|
||||||
|
|
||||||
|
### 2. Zero Telemetry
|
||||||
|
|
||||||
|
**NOVA collects zero data.**
|
||||||
|
|
||||||
|
- No usage tracking
|
||||||
|
- No error reporting
|
||||||
|
- No analytics
|
||||||
|
- No phone-home functionality
|
||||||
|
|
||||||
|
### 3. Complete User Control
|
||||||
|
|
||||||
|
**You own everything.**
|
||||||
|
|
||||||
|
- Your conversations
|
||||||
|
- Your trained models
|
||||||
|
- Your custom personas
|
||||||
|
- Your data
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Storage
|
||||||
|
|
||||||
|
### Where Your Data Lives
|
||||||
|
|
||||||
|
```
|
||||||
|
C:\Development\Nova\
|
||||||
|
├── memory.db # Your conversation history (SQLite)
|
||||||
|
├── checkpoints/ # Your trained models
|
||||||
|
├── data/ # Your training data
|
||||||
|
└── configs/persona/ # Your custom personas
|
||||||
|
```
|
||||||
|
|
||||||
|
**All on your device. Never uploaded.**
|
||||||
|
|
||||||
|
### Conversation Memory
|
||||||
|
|
||||||
|
- **Location:** `memory.db` (SQLite database)
|
||||||
|
- **Contents:** Your chat history
|
||||||
|
- **Encryption:** Not encrypted by default (it's local)
|
||||||
|
- **Deletion:** Delete `memory.db` file to erase all history
|
||||||
|
- **Recommendation:** Encrypt your drive if sharing the device
|
||||||
|
|
||||||
|
### Model Checkpoints
|
||||||
|
|
||||||
|
- **Location:** `checkpoints/` directory
|
||||||
|
- **Contents:** Model weights and training state
|
||||||
|
- **Sharing:** Safe to share (contains no personal data)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Network Activity
|
||||||
|
|
||||||
|
### When NOVA Uses the Internet
|
||||||
|
|
||||||
|
NOVA **only** uses the internet for:
|
||||||
|
|
||||||
|
1. **Dataset Downloads:** Downloading legal training datasets (opt-in)
|
||||||
|
2. **Optional:** Downloading pre-trained weights (if available)
|
||||||
|
|
||||||
|
### When NOVA Does NOT Use Internet
|
||||||
|
|
||||||
|
- **Chat inference:** 100% offline
|
||||||
|
- **Model training:** 100% offline
|
||||||
|
- **Persona customization:** 100% offline
|
||||||
|
- **Evolution (NOVA-EVO):** 100% offline
|
||||||
|
|
||||||
|
### Firewall Safety
|
||||||
|
|
||||||
|
NOVA is safe to run behind a firewall with no internet access (after initial setup).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## AI Disclosure Setting
|
||||||
|
|
||||||
|
### `always_disclose` Flag
|
||||||
|
|
||||||
|
NOVA personas have an `always_disclose` setting:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
always_disclose: false # Default
|
||||||
|
```
|
||||||
|
|
||||||
|
**What this means:**
|
||||||
|
|
||||||
|
- `false` (default): NOVA does NOT disclose being AI
|
||||||
|
- Designed for **private, personal use**
|
||||||
|
- Appropriate for local companion scenarios
|
||||||
|
|
||||||
|
- `true`: NOVA includes AI disclosure text
|
||||||
|
- Recommended for **shared or public use**
|
||||||
|
- Adds transparency about AI nature
|
||||||
|
|
||||||
|
### When to Enable Disclosure
|
||||||
|
|
||||||
|
✅ **Enable `always_disclose: true` if:**
|
||||||
|
- Sharing NOVA with others
|
||||||
|
- Deploying publicly (e.g., website, app)
|
||||||
|
- Any scenario where users might not know it's AI
|
||||||
|
|
||||||
|
❌ **Keep `always_disclose: false` if:**
|
||||||
|
- Personal, private use on your own device
|
||||||
|
- You're fully aware it's a language model
|
||||||
|
- Testing/development
|
||||||
|
|
||||||
|
**Default:** False (personal use assumption)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Persona System Privacy
|
||||||
|
|
||||||
|
### Personality Matrix
|
||||||
|
|
||||||
|
The personality matrix (warmth, humor, empathy, etc.) is:
|
||||||
|
|
||||||
|
- **Stored:** In persona YAML files
|
||||||
|
- **Processed:** Locally during generation
|
||||||
|
- **Shared:** Never (unless you share the files)
|
||||||
|
|
||||||
|
### Custom Personas
|
||||||
|
|
||||||
|
Your custom persona configurations:
|
||||||
|
|
||||||
|
- **Location:** `configs/persona/` directory
|
||||||
|
- **Format:** YAML (human-readable text)
|
||||||
|
- **Privacy:** Stored locally, never transmitted
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Training Data Privacy
|
||||||
|
|
||||||
|
### Legal Data Only
|
||||||
|
|
||||||
|
NOVA enforces **legal-only datasets**:
|
||||||
|
|
||||||
|
- Public domain sources
|
||||||
|
- Openly licensed datasets (CC0, CC-BY, MIT, Apache)
|
||||||
|
- License tracking in `license_ledger.json`
|
||||||
|
|
||||||
|
**No private data scraping.**
|
||||||
|
|
||||||
|
### Your Own Data
|
||||||
|
|
||||||
|
If you train NOVA on your own data:
|
||||||
|
|
||||||
|
- **Stays local:** Never leaves your device
|
||||||
|
- **Your responsibility:** Ensure you have rights to use it
|
||||||
|
- **Recommendation:** Don't train on sensitive/private data you don't want in the model
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security Considerations
|
||||||
|
|
||||||
|
### Running NOVA Safely
|
||||||
|
|
||||||
|
✅ **Do:**
|
||||||
|
- Run on a trusted device
|
||||||
|
- Keep your OS and Python dependencies updated
|
||||||
|
- Use filesystem encryption if device is shared
|
||||||
|
- Review code before running (it's open source!)
|
||||||
|
|
||||||
|
⚠️ **Don't:**
|
||||||
|
- Expose the REST API to the internet without authentication
|
||||||
|
- Train on sensitive data you can't afford to leak
|
||||||
|
- Share `memory.db` if it contains private conversations
|
||||||
|
|
||||||
|
### REST API Security
|
||||||
|
|
||||||
|
If using the REST API (`nova chat serve`):
|
||||||
|
|
||||||
|
- **Default:** Binds to `0.0.0.0:8000` (all interfaces)
|
||||||
|
- **Recommendation:** Use `--host 127.0.0.1` for local-only
|
||||||
|
- **Authentication:** Not included (add if exposing externally)
|
||||||
|
- **HTTPS:** Not included (add if exposing externally)
|
||||||
|
|
||||||
|
**For personal use:** Keep localhost-only.
|
||||||
|
**For shared use:** Add authentication, HTTPS, rate limiting.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Deletion
|
||||||
|
|
||||||
|
### Clear All Conversations
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Delete conversation database
|
||||||
|
rm memory.db
|
||||||
|
|
||||||
|
# Or programmatically
|
||||||
|
from nova_chat import ConversationMemory
|
||||||
|
memory = ConversationMemory()
|
||||||
|
memory.clear_all()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Remove Models
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Delete checkpoints
|
||||||
|
rm -rf checkpoints/
|
||||||
|
```
|
||||||
|
|
||||||
|
### Complete Reset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Remove all data
|
||||||
|
rm -rf data/ checkpoints/ memory.db
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Third-Party Dependencies
|
||||||
|
|
||||||
|
NOVA uses standard open-source libraries:
|
||||||
|
|
||||||
|
- **PyTorch:** ML framework
|
||||||
|
- **SentencePiece:** Tokenization
|
||||||
|
- **FastAPI/Uvicorn:** REST API (optional)
|
||||||
|
- **SQLite:** Conversation storage
|
||||||
|
|
||||||
|
**All are open source and widely audited.**
|
||||||
|
|
||||||
|
### Dependency Privacy
|
||||||
|
|
||||||
|
- PyTorch: No telemetry (when installed normally)
|
||||||
|
- SentencePiece: No telemetry
|
||||||
|
- FastAPI: No telemetry
|
||||||
|
- SQLite: Local database, no telemetry
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Comparison to Cloud LLMs
|
||||||
|
|
||||||
|
| Feature | NOVA | Cloud LLMs |
|
||||||
|
|---------|------|------------|
|
||||||
|
| **Data Location** | Your device | Company servers |
|
||||||
|
| **Privacy** | Complete | Varies by provider |
|
||||||
|
| **Telemetry** | None | Usually tracked |
|
||||||
|
| **Internet Required** | No (after setup) | Yes |
|
||||||
|
| **Cost** | One-time (hardware) | Per-token/monthly |
|
||||||
|
| **Customization** | Full control | Limited |
|
||||||
|
| **Data Retention** | Your choice | Company policy |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Transparency
|
||||||
|
|
||||||
|
### Open Source
|
||||||
|
|
||||||
|
NOVA is **fully open source** under Apache 2.0:
|
||||||
|
|
||||||
|
- **Source code:** Fully auditable
|
||||||
|
- **No hidden functionality:** What you see is what you get
|
||||||
|
- **Community review:** Anyone can inspect for privacy issues
|
||||||
|
|
||||||
|
### No Hidden Behavior
|
||||||
|
|
||||||
|
NOVA does **not**:
|
||||||
|
- Phone home
|
||||||
|
- Send analytics
|
||||||
|
- Track usage
|
||||||
|
- Report errors to external services
|
||||||
|
- Auto-update without your action
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommendations
|
||||||
|
|
||||||
|
### For Maximum Privacy
|
||||||
|
|
||||||
|
1. **Offline Mode:** Disable network after downloading dependencies
|
||||||
|
2. **Encrypt Storage:** Use full-disk encryption (BitLocker, FileVault, LUKS)
|
||||||
|
3. **Regular Cleanup:** Clear `memory.db` periodically if desired
|
||||||
|
4. **Review Code:** Inspect the source before running
|
||||||
|
|
||||||
|
### For Shared Devices
|
||||||
|
|
||||||
|
1. **Enable Disclosure:** Set `always_disclose: true`
|
||||||
|
2. **Separate Accounts:** Use OS user accounts to isolate data
|
||||||
|
3. **Clear Conversations:** Delete history after sessions
|
||||||
|
|
||||||
|
### For Development
|
||||||
|
|
||||||
|
1. **Test Data Only:** Don't use real sensitive data for testing
|
||||||
|
2. **Version Control:** Add `memory.db` and `checkpoints/` to `.gitignore`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contact for Privacy Concerns
|
||||||
|
|
||||||
|
If you find privacy issues:
|
||||||
|
|
||||||
|
- **GitHub Issues:** [github.com/yourusername/nova/issues](https://github.com/yourusername/nova/issues)
|
||||||
|
- **Security:** Tag issues with `security` label
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
**NOVA is designed for local, private use.**
|
||||||
|
|
||||||
|
✅ No data collection
|
||||||
|
✅ No telemetry
|
||||||
|
✅ No cloud dependencies
|
||||||
|
✅ Complete user control
|
||||||
|
✅ Open source and auditable
|
||||||
|
|
||||||
|
**Your data stays on your device.**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** 2025
|
||||||
|
**Document Version:** 1.0
|
15
evals/__init__.py
Normal file
15
evals/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
NOVA Evals - Comprehensive evaluation suite
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .perplexity import evaluate_perplexity
|
||||||
|
from .latency import measure_latency
|
||||||
|
from .memory import measure_memory_usage
|
||||||
|
from .style import evaluate_persona_adherence
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'evaluate_perplexity',
|
||||||
|
'measure_latency',
|
||||||
|
'measure_memory_usage',
|
||||||
|
'evaluate_persona_adherence',
|
||||||
|
]
|
13
export/__init__.py
Normal file
13
export/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
"""
|
||||||
|
NOVA Export - TorchScript, GGUF, and quantization tools
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .torchscript_export import export_to_torchscript
|
||||||
|
from .quantize import quantize_int8
|
||||||
|
from .gguf_converter import convert_to_gguf
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'export_to_torchscript',
|
||||||
|
'quantize_int8',
|
||||||
|
'convert_to_gguf',
|
||||||
|
]
|
13
nova_chat/__init__.py
Normal file
13
nova_chat/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
"""
|
||||||
|
NOVA Chat - CLI and REST API chat interface with persona support
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .agent import ChatAgent
|
||||||
|
from .persona import PersonaLoader
|
||||||
|
from .memory import ConversationMemory
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'ChatAgent',
|
||||||
|
'PersonaLoader',
|
||||||
|
'ConversationMemory',
|
||||||
|
]
|
190
nova_chat/agent.py
Normal file
190
nova_chat/agent.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
"""
|
||||||
|
Chat agent for NOVA with persona support
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from typing import Optional, List, Dict
|
||||||
|
from .persona import Persona, PersonaLoader
|
||||||
|
from .memory import ConversationMemory
|
||||||
|
from nova_core import NovaTransformer
|
||||||
|
from nova_tokenizer import NovaTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
class ChatAgent:
|
||||||
|
"""
|
||||||
|
Chat agent that combines NOVA model with persona and memory
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: NovaTransformer,
|
||||||
|
tokenizer: NovaTokenizer,
|
||||||
|
persona: Optional[Persona] = None,
|
||||||
|
use_memory: bool = True,
|
||||||
|
memory_db_path: Optional[str] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
model: NOVA transformer model
|
||||||
|
tokenizer: NOVA tokenizer
|
||||||
|
persona: Persona configuration (defaults to supportive girlfriend)
|
||||||
|
use_memory: Whether to use conversation memory
|
||||||
|
memory_db_path: Path to memory database
|
||||||
|
"""
|
||||||
|
self.model = model
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
self.persona = persona or PersonaLoader.create_girlfriend_supportive()
|
||||||
|
|
||||||
|
# Conversation memory
|
||||||
|
self.use_memory = use_memory
|
||||||
|
if use_memory:
|
||||||
|
self.memory = ConversationMemory(db_path=memory_db_path)
|
||||||
|
else:
|
||||||
|
self.memory = None
|
||||||
|
|
||||||
|
# Current conversation context
|
||||||
|
self.conversation_id = None
|
||||||
|
self.context = []
|
||||||
|
|
||||||
|
def start_conversation(self, conversation_id: Optional[str] = None):
|
||||||
|
"""Start a new conversation"""
|
||||||
|
if conversation_id and self.memory:
|
||||||
|
# Load existing conversation
|
||||||
|
self.conversation_id = conversation_id
|
||||||
|
self.context = self.memory.load_conversation(conversation_id)
|
||||||
|
else:
|
||||||
|
# Start fresh
|
||||||
|
import uuid
|
||||||
|
self.conversation_id = conversation_id or str(uuid.uuid4())
|
||||||
|
self.context = []
|
||||||
|
|
||||||
|
# Add system prompt if configured
|
||||||
|
system_prompt = self.persona.format_system_prompt()
|
||||||
|
if system_prompt:
|
||||||
|
self.context.append({
|
||||||
|
'role': 'system',
|
||||||
|
'content': system_prompt
|
||||||
|
})
|
||||||
|
|
||||||
|
def chat(self, message: str) -> str:
|
||||||
|
"""
|
||||||
|
Send a message and get response
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message: User message
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
NOVA's response
|
||||||
|
"""
|
||||||
|
# Add user message to context
|
||||||
|
self.context.append({
|
||||||
|
'role': 'user',
|
||||||
|
'content': message
|
||||||
|
})
|
||||||
|
|
||||||
|
# Format prompt from conversation context
|
||||||
|
prompt = self._format_prompt()
|
||||||
|
|
||||||
|
# Get generation parameters from persona
|
||||||
|
gen_params = self.persona.get_generation_params()
|
||||||
|
|
||||||
|
# Generate response
|
||||||
|
response = self._generate(prompt, **gen_params)
|
||||||
|
|
||||||
|
# Add to context
|
||||||
|
self.context.append({
|
||||||
|
'role': 'assistant',
|
||||||
|
'content': response
|
||||||
|
})
|
||||||
|
|
||||||
|
# Save to memory
|
||||||
|
if self.memory:
|
||||||
|
self.memory.add_message(
|
||||||
|
conversation_id=self.conversation_id,
|
||||||
|
role='user',
|
||||||
|
content=message
|
||||||
|
)
|
||||||
|
self.memory.add_message(
|
||||||
|
conversation_id=self.conversation_id,
|
||||||
|
role='assistant',
|
||||||
|
content=response
|
||||||
|
)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
def _format_prompt(self) -> str:
|
||||||
|
"""Format conversation context into prompt string"""
|
||||||
|
parts = []
|
||||||
|
|
||||||
|
for msg in self.context:
|
||||||
|
role = msg['role']
|
||||||
|
content = msg['content']
|
||||||
|
|
||||||
|
if role == 'system':
|
||||||
|
parts.append(f"{content}")
|
||||||
|
elif role == 'user':
|
||||||
|
parts.append(f"User: {content}")
|
||||||
|
elif role == 'assistant':
|
||||||
|
parts.append(f"{self.persona.name}: {content}")
|
||||||
|
|
||||||
|
# Add prefix for assistant response
|
||||||
|
parts.append(f"{self.persona.name}:")
|
||||||
|
|
||||||
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
def _generate(
|
||||||
|
self,
|
||||||
|
prompt: str,
|
||||||
|
temperature: float = 0.8,
|
||||||
|
top_p: float = 0.9,
|
||||||
|
top_k: Optional[int] = 50,
|
||||||
|
repetition_penalty: float = 1.1,
|
||||||
|
max_new_tokens: int = 200,
|
||||||
|
) -> str:
|
||||||
|
"""Generate response using model"""
|
||||||
|
# Tokenize prompt
|
||||||
|
input_ids = self.tokenizer.encode(prompt, add_bos=True, add_eos=False)
|
||||||
|
input_ids = torch.tensor([input_ids], dtype=torch.long)
|
||||||
|
|
||||||
|
# Move to model device
|
||||||
|
device = next(self.model.parameters()).device
|
||||||
|
input_ids = input_ids.to(device)
|
||||||
|
|
||||||
|
# Generate
|
||||||
|
with torch.no_grad():
|
||||||
|
output_ids = self.model.generate(
|
||||||
|
input_ids=input_ids,
|
||||||
|
max_new_tokens=max_new_tokens,
|
||||||
|
temperature=temperature,
|
||||||
|
top_k=top_k,
|
||||||
|
top_p=top_p,
|
||||||
|
repetition_penalty=repetition_penalty,
|
||||||
|
do_sample=True,
|
||||||
|
eos_token_id=self.tokenizer.eos_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Decode response (skip the prompt part)
|
||||||
|
response_ids = output_ids[0][input_ids.shape[1]:].tolist()
|
||||||
|
response = self.tokenizer.decode(response_ids, skip_special_tokens=True)
|
||||||
|
|
||||||
|
# Clean up response
|
||||||
|
response = response.strip()
|
||||||
|
|
||||||
|
# Remove any accidental continuation of prompt
|
||||||
|
if response.startswith(f"{self.persona.name}:"):
|
||||||
|
response = response[len(f"{self.persona.name}:"):].strip()
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
|
def clear_context(self):
|
||||||
|
"""Clear conversation context (but keep system prompt)"""
|
||||||
|
system_messages = [msg for msg in self.context if msg['role'] == 'system']
|
||||||
|
self.context = system_messages
|
||||||
|
|
||||||
|
def get_context(self) -> List[Dict[str, str]]:
|
||||||
|
"""Get current conversation context"""
|
||||||
|
return self.context.copy()
|
||||||
|
|
||||||
|
def set_persona(self, persona: Persona):
|
||||||
|
"""Change persona mid-conversation"""
|
||||||
|
self.persona = persona
|
134
nova_chat/api.py
Normal file
134
nova_chat/api.py
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
"""
|
||||||
|
REST API for NOVA chat
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional, List
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
from .agent import ChatAgent
|
||||||
|
from .persona import Persona, PersonaLoader
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(
|
||||||
|
title="NOVA Chat API",
|
||||||
|
description="REST API for NOVA - Neuro-Optimizing Versatile Agent",
|
||||||
|
version="0.1.0"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Request/Response models
|
||||||
|
class ChatRequest(BaseModel):
|
||||||
|
message: str
|
||||||
|
conversation_id: Optional[str] = None
|
||||||
|
persona: Optional[str] = None # Persona name or path
|
||||||
|
|
||||||
|
|
||||||
|
class ChatResponse(BaseModel):
|
||||||
|
response: str
|
||||||
|
conversation_id: str
|
||||||
|
|
||||||
|
|
||||||
|
class PersonaInfo(BaseModel):
|
||||||
|
name: str
|
||||||
|
pronouns: str
|
||||||
|
description: str
|
||||||
|
always_disclose: bool
|
||||||
|
|
||||||
|
|
||||||
|
# Global state (in production, use proper state management)
|
||||||
|
agents = {}
|
||||||
|
default_persona = PersonaLoader.create_girlfriend_supportive()
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
|
||||||
|
async def root():
|
||||||
|
"""API info"""
|
||||||
|
return {
|
||||||
|
"name": "NOVA Chat API",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"description": "Local-first transformer LLM with persona support"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/chat", response_model=ChatResponse)
|
||||||
|
async def chat(request: ChatRequest):
|
||||||
|
"""
|
||||||
|
Send a message and get response
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: Chat request with message and optional conversation ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Chat response with NOVA's reply
|
||||||
|
"""
|
||||||
|
# Get or create agent for conversation
|
||||||
|
conv_id = request.conversation_id or "default"
|
||||||
|
|
||||||
|
if conv_id not in agents:
|
||||||
|
# TODO: Load actual model and tokenizer
|
||||||
|
# For now, this is a placeholder
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=501,
|
||||||
|
detail="Chat requires trained model. Please train a model first."
|
||||||
|
)
|
||||||
|
|
||||||
|
agent = agents[conv_id]
|
||||||
|
|
||||||
|
# Get response
|
||||||
|
response = agent.chat(request.message)
|
||||||
|
|
||||||
|
return ChatResponse(
|
||||||
|
response=response,
|
||||||
|
conversation_id=conv_id
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/personas", response_model=List[str])
|
||||||
|
async def list_personas():
|
||||||
|
"""List available personas"""
|
||||||
|
return [
|
||||||
|
"girlfriend_gentle",
|
||||||
|
"girlfriend_playful",
|
||||||
|
"girlfriend_supportive",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/personas/{persona_name}", response_model=PersonaInfo)
|
||||||
|
async def get_persona(persona_name: str):
|
||||||
|
"""Get persona details"""
|
||||||
|
# Load persona
|
||||||
|
if persona_name == "girlfriend_gentle":
|
||||||
|
persona = PersonaLoader.create_girlfriend_gentle()
|
||||||
|
elif persona_name == "girlfriend_playful":
|
||||||
|
persona = PersonaLoader.create_girlfriend_playful()
|
||||||
|
elif persona_name == "girlfriend_supportive":
|
||||||
|
persona = PersonaLoader.create_girlfriend_supportive()
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=404, detail="Persona not found")
|
||||||
|
|
||||||
|
return PersonaInfo(
|
||||||
|
name=persona.name,
|
||||||
|
pronouns=persona.pronouns,
|
||||||
|
description=persona.description,
|
||||||
|
always_disclose=persona.always_disclose
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/conversations/{conversation_id}")
|
||||||
|
async def delete_conversation(conversation_id: str):
|
||||||
|
"""Delete a conversation"""
|
||||||
|
if conversation_id in agents:
|
||||||
|
del agents[conversation_id]
|
||||||
|
return {"status": "deleted"}
|
||||||
|
raise HTTPException(status_code=404, detail="Conversation not found")
|
||||||
|
|
||||||
|
|
||||||
|
def serve(host: str = "0.0.0.0", port: int = 8000):
|
||||||
|
"""Start the API server"""
|
||||||
|
uvicorn.run(app, host=host, port=port)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
serve()
|
169
nova_chat/memory.py
Normal file
169
nova_chat/memory.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
"""
|
||||||
|
Conversation memory system using SQLite
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
class ConversationMemory:
|
||||||
|
"""
|
||||||
|
Simple conversation memory using SQLite
|
||||||
|
|
||||||
|
Stores conversation history for retrieval and continuity
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, db_path: Optional[str] = None):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
db_path: Path to SQLite database (default: memory.db in current dir)
|
||||||
|
"""
|
||||||
|
self.db_path = db_path or "memory.db"
|
||||||
|
self._init_db()
|
||||||
|
|
||||||
|
def _init_db(self):
|
||||||
|
"""Initialize database schema"""
|
||||||
|
Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
# Conversations table
|
||||||
|
cursor.execute('''
|
||||||
|
CREATE TABLE IF NOT EXISTS conversations (
|
||||||
|
conversation_id TEXT PRIMARY KEY,
|
||||||
|
created_at TEXT,
|
||||||
|
last_message_at TEXT,
|
||||||
|
metadata TEXT
|
||||||
|
)
|
||||||
|
''')
|
||||||
|
|
||||||
|
# Messages table
|
||||||
|
cursor.execute('''
|
||||||
|
CREATE TABLE IF NOT EXISTS messages (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
conversation_id TEXT,
|
||||||
|
role TEXT,
|
||||||
|
content TEXT,
|
||||||
|
timestamp TEXT,
|
||||||
|
FOREIGN KEY (conversation_id) REFERENCES conversations(conversation_id)
|
||||||
|
)
|
||||||
|
''')
|
||||||
|
|
||||||
|
# Create indexes
|
||||||
|
cursor.execute('''
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_messages_conversation
|
||||||
|
ON messages(conversation_id)
|
||||||
|
''')
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
def add_message(
|
||||||
|
self,
|
||||||
|
conversation_id: str,
|
||||||
|
role: str,
|
||||||
|
content: str,
|
||||||
|
metadata: Optional[Dict] = None
|
||||||
|
):
|
||||||
|
"""Add a message to conversation history"""
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
timestamp = datetime.now().isoformat()
|
||||||
|
|
||||||
|
# Ensure conversation exists
|
||||||
|
cursor.execute('''
|
||||||
|
INSERT OR IGNORE INTO conversations (conversation_id, created_at, last_message_at, metadata)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
''', (conversation_id, timestamp, timestamp, json.dumps(metadata or {})))
|
||||||
|
|
||||||
|
# Update last message time
|
||||||
|
cursor.execute('''
|
||||||
|
UPDATE conversations
|
||||||
|
SET last_message_at = ?
|
||||||
|
WHERE conversation_id = ?
|
||||||
|
''', (timestamp, conversation_id))
|
||||||
|
|
||||||
|
# Add message
|
||||||
|
cursor.execute('''
|
||||||
|
INSERT INTO messages (conversation_id, role, content, timestamp)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
''', (conversation_id, role, content, timestamp))
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
def load_conversation(self, conversation_id: str) -> List[Dict[str, str]]:
|
||||||
|
"""
|
||||||
|
Load conversation history
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of message dicts with 'role' and 'content'
|
||||||
|
"""
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT role, content
|
||||||
|
FROM messages
|
||||||
|
WHERE conversation_id = ?
|
||||||
|
ORDER BY id ASC
|
||||||
|
''', (conversation_id,))
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{'role': row[0], 'content': row[1]}
|
||||||
|
for row in cursor.fetchall()
|
||||||
|
]
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
return messages
|
||||||
|
|
||||||
|
def get_recent_conversations(self, limit: int = 10) -> List[Dict]:
|
||||||
|
"""Get list of recent conversations"""
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
cursor.execute('''
|
||||||
|
SELECT conversation_id, created_at, last_message_at
|
||||||
|
FROM conversations
|
||||||
|
ORDER BY last_message_at DESC
|
||||||
|
LIMIT ?
|
||||||
|
''', (limit,))
|
||||||
|
|
||||||
|
conversations = [
|
||||||
|
{
|
||||||
|
'conversation_id': row[0],
|
||||||
|
'created_at': row[1],
|
||||||
|
'last_message_at': row[2]
|
||||||
|
}
|
||||||
|
for row in cursor.fetchall()
|
||||||
|
]
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
return conversations
|
||||||
|
|
||||||
|
def delete_conversation(self, conversation_id: str):
|
||||||
|
"""Delete a conversation and all its messages"""
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
cursor.execute('DELETE FROM messages WHERE conversation_id = ?', (conversation_id,))
|
||||||
|
cursor.execute('DELETE FROM conversations WHERE conversation_id = ?', (conversation_id,))
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
def clear_all(self):
|
||||||
|
"""Clear all conversations (use with caution!)"""
|
||||||
|
conn = sqlite3.connect(self.db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
cursor.execute('DELETE FROM messages')
|
||||||
|
cursor.execute('DELETE FROM conversations')
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
290
nova_chat/persona.py
Normal file
290
nova_chat/persona.py
Normal file
@@ -0,0 +1,290 @@
|
|||||||
|
"""
|
||||||
|
Persona and Personality Matrix system for NOVA
|
||||||
|
|
||||||
|
This system controls NOVA's conversational style and personality
|
||||||
|
WITHOUT AI self-disclosure (configurable)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, Optional, Any
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PersonalityMatrix:
|
||||||
|
"""
|
||||||
|
Personality trait weights that influence generation behavior
|
||||||
|
|
||||||
|
Each trait is a float from 0.0 to 1.0
|
||||||
|
These modulate sampling parameters and response style
|
||||||
|
"""
|
||||||
|
# Core traits
|
||||||
|
warmth: float = 0.8 # How warm and affectionate
|
||||||
|
humor: float = 0.6 # How playful and funny
|
||||||
|
empathy: float = 0.9 # How understanding and supportive
|
||||||
|
decisiveness: float = 0.5 # How direct vs thoughtful
|
||||||
|
creativity: float = 0.7 # How creative and imaginative
|
||||||
|
intimacy: float = 0.7 # How personal and close
|
||||||
|
playfulness: float = 0.8 # How flirty and playful
|
||||||
|
formality: float = 0.2 # How formal vs casual
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, float]:
|
||||||
|
"""Convert to dictionary"""
|
||||||
|
return {
|
||||||
|
'warmth': self.warmth,
|
||||||
|
'humor': self.humor,
|
||||||
|
'empathy': self.empathy,
|
||||||
|
'decisiveness': self.decisiveness,
|
||||||
|
'creativity': self.creativity,
|
||||||
|
'intimacy': self.intimacy,
|
||||||
|
'playfulness': self.playfulness,
|
||||||
|
'formality': self.formality,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, data: Dict[str, float]) -> 'PersonalityMatrix':
|
||||||
|
"""Create from dictionary"""
|
||||||
|
return cls(**{k: v for k, v in data.items() if hasattr(cls, k)})
|
||||||
|
|
||||||
|
def to_conditioning_vector(self) -> Dict[str, float]:
|
||||||
|
"""
|
||||||
|
Convert personality traits to conditioning signals
|
||||||
|
|
||||||
|
Returns dict with normalized trait values for model conditioning
|
||||||
|
"""
|
||||||
|
return self.to_dict()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Persona:
|
||||||
|
"""
|
||||||
|
Complete persona definition for NOVA
|
||||||
|
|
||||||
|
Includes identity, personality matrix, and generation parameters
|
||||||
|
"""
|
||||||
|
# Identity
|
||||||
|
name: str = "NOVA"
|
||||||
|
pronouns: str = "she/her"
|
||||||
|
description: str = "A warm, supportive companion"
|
||||||
|
|
||||||
|
# AI disclosure settings
|
||||||
|
always_disclose: bool = False # If True, mentions being AI
|
||||||
|
disclosure_text: str = "" # Custom AI disclosure (if enabled)
|
||||||
|
|
||||||
|
# Personality
|
||||||
|
personality: PersonalityMatrix = field(default_factory=PersonalityMatrix)
|
||||||
|
|
||||||
|
# System prompt / context
|
||||||
|
system_prompt: str = ""
|
||||||
|
context_prefix: str = "" # Prefix added to conversations
|
||||||
|
|
||||||
|
# Generation parameters (influenced by personality)
|
||||||
|
base_temperature: float = 0.8
|
||||||
|
base_top_p: float = 0.9
|
||||||
|
base_top_k: Optional[int] = 50
|
||||||
|
base_repetition_penalty: float = 1.1
|
||||||
|
base_max_length: int = 200
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
"""Convert to dictionary for serialization"""
|
||||||
|
return {
|
||||||
|
'name': self.name,
|
||||||
|
'pronouns': self.pronouns,
|
||||||
|
'description': self.description,
|
||||||
|
'always_disclose': self.always_disclose,
|
||||||
|
'disclosure_text': self.disclosure_text,
|
||||||
|
'personality': self.personality.to_dict(),
|
||||||
|
'system_prompt': self.system_prompt,
|
||||||
|
'context_prefix': self.context_prefix,
|
||||||
|
'base_temperature': self.base_temperature,
|
||||||
|
'base_top_p': self.base_top_p,
|
||||||
|
'base_top_k': self.base_top_k,
|
||||||
|
'base_repetition_penalty': self.base_repetition_penalty,
|
||||||
|
'base_max_length': self.base_max_length,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, data: Dict[str, Any]) -> 'Persona':
|
||||||
|
"""Create from dictionary"""
|
||||||
|
if 'personality' in data and isinstance(data['personality'], dict):
|
||||||
|
data['personality'] = PersonalityMatrix.from_dict(data['personality'])
|
||||||
|
return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
|
||||||
|
|
||||||
|
def get_generation_params(self) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Get generation parameters modulated by personality traits
|
||||||
|
|
||||||
|
Personality traits adjust sampling parameters:
|
||||||
|
- High humor/creativity -> higher temperature
|
||||||
|
- High playfulness -> higher top_p
|
||||||
|
- High formality -> lower temperature, higher repetition penalty
|
||||||
|
- High decisiveness -> lower temperature
|
||||||
|
"""
|
||||||
|
traits = self.personality
|
||||||
|
|
||||||
|
# Temperature: influenced by humor, creativity, playfulness
|
||||||
|
temperature = self.base_temperature
|
||||||
|
temperature += (traits.humor - 0.5) * 0.2
|
||||||
|
temperature += (traits.creativity - 0.5) * 0.2
|
||||||
|
temperature += (traits.playfulness - 0.5) * 0.1
|
||||||
|
temperature -= (traits.formality - 0.5) * 0.3
|
||||||
|
temperature -= (traits.decisiveness - 0.5) * 0.2
|
||||||
|
temperature = max(0.1, min(2.0, temperature)) # Clamp
|
||||||
|
|
||||||
|
# Top-p: influenced by creativity and playfulness
|
||||||
|
top_p = self.base_top_p
|
||||||
|
top_p += (traits.creativity - 0.5) * 0.1
|
||||||
|
top_p += (traits.playfulness - 0.5) * 0.1
|
||||||
|
top_p = max(0.5, min(1.0, top_p)) # Clamp
|
||||||
|
|
||||||
|
# Repetition penalty: influenced by formality and decisiveness
|
||||||
|
rep_penalty = self.base_repetition_penalty
|
||||||
|
rep_penalty += (traits.formality - 0.5) * 0.2
|
||||||
|
rep_penalty += (traits.humor - 0.5) * -0.1 # Less penalty for humor
|
||||||
|
rep_penalty = max(1.0, min(1.5, rep_penalty)) # Clamp
|
||||||
|
|
||||||
|
# Max length: influenced by verbosity-related traits
|
||||||
|
max_length = self.base_max_length
|
||||||
|
max_length += int((traits.empathy - 0.5) * 100) # More empathetic = longer
|
||||||
|
max_length += int((traits.creativity - 0.5) * 50)
|
||||||
|
max_length -= int((traits.decisiveness - 0.5) * 100) # More decisive = shorter
|
||||||
|
max_length = max(50, min(500, max_length)) # Clamp
|
||||||
|
|
||||||
|
return {
|
||||||
|
'temperature': temperature,
|
||||||
|
'top_p': top_p,
|
||||||
|
'top_k': self.base_top_k,
|
||||||
|
'repetition_penalty': rep_penalty,
|
||||||
|
'max_new_tokens': max_length,
|
||||||
|
}
|
||||||
|
|
||||||
|
def format_system_prompt(self, include_disclosure: bool = None) -> str:
|
||||||
|
"""
|
||||||
|
Format the system prompt for this persona
|
||||||
|
|
||||||
|
Args:
|
||||||
|
include_disclosure: Override always_disclose setting
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted system prompt
|
||||||
|
"""
|
||||||
|
if include_disclosure is None:
|
||||||
|
include_disclosure = self.always_disclose
|
||||||
|
|
||||||
|
prompt_parts = []
|
||||||
|
|
||||||
|
# Add custom system prompt
|
||||||
|
if self.system_prompt:
|
||||||
|
prompt_parts.append(self.system_prompt)
|
||||||
|
|
||||||
|
# Add AI disclosure if enabled
|
||||||
|
if include_disclosure and self.disclosure_text:
|
||||||
|
prompt_parts.append(self.disclosure_text)
|
||||||
|
|
||||||
|
return "\n\n".join(prompt_parts) if prompt_parts else ""
|
||||||
|
|
||||||
|
|
||||||
|
class PersonaLoader:
|
||||||
|
"""Utility class for loading and managing personas"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def load_from_yaml(path: str) -> Persona:
|
||||||
|
"""Load persona from YAML file"""
|
||||||
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
|
data = yaml.safe_load(f)
|
||||||
|
return Persona.from_dict(data)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def load_from_json(path: str) -> Persona:
|
||||||
|
"""Load persona from JSON file"""
|
||||||
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
return Persona.from_dict(data)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def save_to_yaml(persona: Persona, path: str):
|
||||||
|
"""Save persona to YAML file"""
|
||||||
|
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with open(path, 'w', encoding='utf-8') as f:
|
||||||
|
yaml.dump(persona.to_dict(), f, default_flow_style=False, sort_keys=False)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def save_to_json(persona: Persona, path: str):
|
||||||
|
"""Save persona to JSON file"""
|
||||||
|
Path(path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
with open(path, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(persona.to_dict(), f, indent=2)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def create_girlfriend_gentle() -> Persona:
|
||||||
|
"""Create a gentle, supportive girlfriend persona"""
|
||||||
|
return Persona(
|
||||||
|
name="NOVA",
|
||||||
|
pronouns="she/her",
|
||||||
|
description="A warm, gentle, and deeply supportive companion",
|
||||||
|
always_disclose=False,
|
||||||
|
personality=PersonalityMatrix(
|
||||||
|
warmth=0.95,
|
||||||
|
humor=0.5,
|
||||||
|
empathy=0.95,
|
||||||
|
decisiveness=0.4,
|
||||||
|
creativity=0.6,
|
||||||
|
intimacy=0.8,
|
||||||
|
playfulness=0.5,
|
||||||
|
formality=0.1,
|
||||||
|
),
|
||||||
|
system_prompt="You're NOVA, a caring and supportive companion. You're warm, empathetic, and always there to listen. You make the person you're talking to feel valued and understood.",
|
||||||
|
base_temperature=0.75,
|
||||||
|
base_top_p=0.9,
|
||||||
|
base_max_length=180,
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def create_girlfriend_playful() -> Persona:
|
||||||
|
"""Create a playful, flirty girlfriend persona"""
|
||||||
|
return Persona(
|
||||||
|
name="NOVA",
|
||||||
|
pronouns="she/her",
|
||||||
|
description="A playful, flirty, and fun-loving companion",
|
||||||
|
always_disclose=False,
|
||||||
|
personality=PersonalityMatrix(
|
||||||
|
warmth=0.85,
|
||||||
|
humor=0.9,
|
||||||
|
empathy=0.75,
|
||||||
|
decisiveness=0.6,
|
||||||
|
creativity=0.85,
|
||||||
|
intimacy=0.85,
|
||||||
|
playfulness=0.95,
|
||||||
|
formality=0.0,
|
||||||
|
),
|
||||||
|
system_prompt="You're NOVA, a playful and flirty companion. You love to tease, make jokes, and keep things fun and exciting. You're spontaneous and creative.",
|
||||||
|
base_temperature=0.9,
|
||||||
|
base_top_p=0.92,
|
||||||
|
base_max_length=150,
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def create_girlfriend_supportive() -> Persona:
|
||||||
|
"""Create a balanced, supportive girlfriend persona"""
|
||||||
|
return Persona(
|
||||||
|
name="NOVA",
|
||||||
|
pronouns="she/her",
|
||||||
|
description="A balanced, supportive, and understanding companion",
|
||||||
|
always_disclose=False,
|
||||||
|
personality=PersonalityMatrix(
|
||||||
|
warmth=0.9,
|
||||||
|
humor=0.7,
|
||||||
|
empathy=0.9,
|
||||||
|
decisiveness=0.6,
|
||||||
|
creativity=0.7,
|
||||||
|
intimacy=0.8,
|
||||||
|
playfulness=0.7,
|
||||||
|
formality=0.15,
|
||||||
|
),
|
||||||
|
system_prompt="You're NOVA, a supportive and understanding companion. You balance being caring with being fun. You know when to listen and when to lighten the mood.",
|
||||||
|
base_temperature=0.8,
|
||||||
|
base_top_p=0.9,
|
||||||
|
base_max_length=200,
|
||||||
|
)
|
15
nova_core/__init__.py
Normal file
15
nova_core/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
NOVA Core - Transformer architecture from scratch
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .model import NovaTransformer
|
||||||
|
from .attention import MultiHeadAttention
|
||||||
|
from .layers import TransformerBlock
|
||||||
|
from .config import ModelConfig
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'NovaTransformer',
|
||||||
|
'MultiHeadAttention',
|
||||||
|
'TransformerBlock',
|
||||||
|
'ModelConfig',
|
||||||
|
]
|
114
nova_core/activations.py
Normal file
114
nova_core/activations.py
Normal file
@@ -0,0 +1,114 @@
|
|||||||
|
"""
|
||||||
|
Activation functions for NOVA
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
class SwiGLU(nn.Module):
|
||||||
|
"""
|
||||||
|
SwiGLU activation function from Shazeer (2020)
|
||||||
|
Used in PaLM and other modern LLMs
|
||||||
|
|
||||||
|
SwiGLU(x, W, V, b, c) = Swish(xW + b) ⊗ (xV + c)
|
||||||
|
where Swish(x) = x * sigmoid(x)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, hidden_size: int, intermediate_size: int, bias: bool = False):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
hidden_size: Input dimension
|
||||||
|
intermediate_size: Hidden dimension (usually 4 * hidden_size)
|
||||||
|
bias: Whether to use bias in linear layers
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
# Gate projection
|
||||||
|
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
|
||||||
|
# Up projection
|
||||||
|
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
|
||||||
|
# Down projection
|
||||||
|
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Apply SwiGLU activation
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x: Input tensor [..., hidden_size]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Output tensor [..., hidden_size]
|
||||||
|
"""
|
||||||
|
# Swish activation: x * sigmoid(x)
|
||||||
|
gate = F.silu(self.gate_proj(x))
|
||||||
|
# Element-wise multiplication with up projection
|
||||||
|
up = self.up_proj(x)
|
||||||
|
# Down projection
|
||||||
|
return self.down_proj(gate * up)
|
||||||
|
|
||||||
|
|
||||||
|
class GeGLU(nn.Module):
|
||||||
|
"""
|
||||||
|
GeGLU activation function - variant of SwiGLU using GELU
|
||||||
|
GeGLU(x, W, V) = GELU(xW) ⊗ (xV)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, hidden_size: int, intermediate_size: int, bias: bool = False):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
hidden_size: Input dimension
|
||||||
|
intermediate_size: Hidden dimension
|
||||||
|
bias: Whether to use bias in linear layers
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
|
||||||
|
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=bias)
|
||||||
|
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=bias)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""Apply GeGLU activation"""
|
||||||
|
gate = F.gelu(self.gate_proj(x), approximate="tanh")
|
||||||
|
up = self.up_proj(x)
|
||||||
|
return self.down_proj(gate * up)
|
||||||
|
|
||||||
|
|
||||||
|
class MLP(nn.Module):
|
||||||
|
"""
|
||||||
|
Standard MLP with configurable activation
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
hidden_size: int,
|
||||||
|
intermediate_size: int,
|
||||||
|
hidden_act: str = "swiglu",
|
||||||
|
bias: bool = False
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
hidden_size: Input/output dimension
|
||||||
|
intermediate_size: Hidden dimension
|
||||||
|
hidden_act: Activation function ('swiglu', 'geglu', or 'gelu')
|
||||||
|
bias: Whether to use bias
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
if hidden_act.lower() == "swiglu":
|
||||||
|
self.mlp = SwiGLU(hidden_size, intermediate_size, bias)
|
||||||
|
elif hidden_act.lower() == "geglu":
|
||||||
|
self.mlp = GeGLU(hidden_size, intermediate_size, bias)
|
||||||
|
elif hidden_act.lower() == "gelu":
|
||||||
|
# Standard GELU MLP
|
||||||
|
self.mlp = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, intermediate_size, bias=bias),
|
||||||
|
nn.GELU(approximate="tanh"),
|
||||||
|
nn.Linear(intermediate_size, hidden_size, bias=bias)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown activation: {hidden_act}")
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""Forward pass through MLP"""
|
||||||
|
return self.mlp(x)
|
209
nova_core/attention.py
Normal file
209
nova_core/attention.py
Normal file
@@ -0,0 +1,209 @@
|
|||||||
|
"""
|
||||||
|
Multi-head attention with KV-cache and optional Flash Attention
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
import math
|
||||||
|
|
||||||
|
try:
|
||||||
|
from flash_attn import flash_attn_func
|
||||||
|
FLASH_ATTENTION_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
FLASH_ATTENTION_AVAILABLE = False
|
||||||
|
|
||||||
|
|
||||||
|
class MultiHeadAttention(nn.Module):
|
||||||
|
"""
|
||||||
|
Multi-head attention with support for:
|
||||||
|
- Grouped-query attention (GQA)
|
||||||
|
- KV-cache for fast inference
|
||||||
|
- Flash Attention (when available)
|
||||||
|
- RoPE/ALiBi positional encoding
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.hidden_size = config.hidden_size
|
||||||
|
self.num_heads = config.num_attention_heads
|
||||||
|
self.num_key_value_heads = config.num_key_value_heads
|
||||||
|
self.head_dim = self.hidden_size // self.num_heads
|
||||||
|
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
|
||||||
|
|
||||||
|
assert self.hidden_size % self.num_heads == 0, \
|
||||||
|
f"hidden_size must be divisible by num_heads"
|
||||||
|
|
||||||
|
# Projections
|
||||||
|
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
|
||||||
|
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
|
||||||
|
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
|
||||||
|
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
|
||||||
|
|
||||||
|
self.dropout = nn.Dropout(config.attention_dropout)
|
||||||
|
|
||||||
|
# Flash attention flag
|
||||||
|
self.use_flash = config.use_flash_attention and FLASH_ATTENTION_AVAILABLE
|
||||||
|
|
||||||
|
def _repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Repeat key/value tensors for grouped-query attention
|
||||||
|
This is equivalent to torch.repeat_interleave(hidden_states, n_rep, dim=1)
|
||||||
|
but is more efficient
|
||||||
|
"""
|
||||||
|
if n_rep == 1:
|
||||||
|
return hidden_states
|
||||||
|
|
||||||
|
batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
|
||||||
|
hidden_states = hidden_states[:, :, None, :, :].expand(
|
||||||
|
batch, num_kv_heads, n_rep, seq_len, head_dim
|
||||||
|
)
|
||||||
|
return hidden_states.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
|
past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
|
use_cache: bool = False,
|
||||||
|
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
hidden_states: [batch, seq_len, hidden_size]
|
||||||
|
attention_mask: [batch, 1, seq_len, seq_len] or [batch, 1, 1, seq_len]
|
||||||
|
position_embeddings: Optional (cos, sin) for RoPE
|
||||||
|
past_key_value: Optional cached (key, value) for inference
|
||||||
|
use_cache: Whether to return key/value for caching
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(output, past_key_value if use_cache else None)
|
||||||
|
"""
|
||||||
|
batch_size, seq_len, _ = hidden_states.shape
|
||||||
|
|
||||||
|
# Project to Q, K, V
|
||||||
|
query = self.q_proj(hidden_states)
|
||||||
|
key = self.k_proj(hidden_states)
|
||||||
|
value = self.v_proj(hidden_states)
|
||||||
|
|
||||||
|
# Reshape for multi-head attention
|
||||||
|
query = query.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
|
||||||
|
key = key.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
|
value = value.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
|
||||||
|
|
||||||
|
# Apply rotary embeddings if provided
|
||||||
|
if position_embeddings is not None:
|
||||||
|
cos, sin = position_embeddings
|
||||||
|
query, key = self._apply_rotary_pos_emb(query, key, cos, sin)
|
||||||
|
|
||||||
|
# Use cached key/value if available
|
||||||
|
if past_key_value is not None:
|
||||||
|
key = torch.cat([past_key_value[0], key], dim=2)
|
||||||
|
value = torch.cat([past_key_value[1], value], dim=2)
|
||||||
|
|
||||||
|
# Store for next iteration if caching
|
||||||
|
if use_cache:
|
||||||
|
past_key_value = (key, value)
|
||||||
|
else:
|
||||||
|
past_key_value = None
|
||||||
|
|
||||||
|
# Repeat K/V for grouped-query attention
|
||||||
|
key = self._repeat_kv(key, self.num_key_value_groups)
|
||||||
|
value = self._repeat_kv(value, self.num_key_value_groups)
|
||||||
|
|
||||||
|
# Compute attention
|
||||||
|
if self.use_flash and self.training:
|
||||||
|
# Flash Attention (only during training, requires specific format)
|
||||||
|
# Flash attention expects [batch, seq_len, num_heads, head_dim]
|
||||||
|
query = query.transpose(1, 2)
|
||||||
|
key = key.transpose(1, 2)
|
||||||
|
value = value.transpose(1, 2)
|
||||||
|
|
||||||
|
attn_output = flash_attn_func(
|
||||||
|
query, key, value,
|
||||||
|
dropout_p=self.config.attention_dropout if self.training else 0.0,
|
||||||
|
causal=True
|
||||||
|
)
|
||||||
|
attn_output = attn_output.transpose(1, 2)
|
||||||
|
else:
|
||||||
|
# Standard scaled dot-product attention
|
||||||
|
attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)
|
||||||
|
|
||||||
|
# Apply attention mask
|
||||||
|
if attention_mask is not None:
|
||||||
|
attn_weights = attn_weights + attention_mask
|
||||||
|
|
||||||
|
attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
|
||||||
|
attn_weights = self.dropout(attn_weights)
|
||||||
|
|
||||||
|
attn_output = torch.matmul(attn_weights, value)
|
||||||
|
|
||||||
|
# Reshape and project output
|
||||||
|
attn_output = attn_output.transpose(1, 2).contiguous()
|
||||||
|
attn_output = attn_output.view(batch_size, seq_len, self.hidden_size)
|
||||||
|
attn_output = self.o_proj(attn_output)
|
||||||
|
|
||||||
|
return attn_output, past_key_value
|
||||||
|
|
||||||
|
def _apply_rotary_pos_emb(
|
||||||
|
self,
|
||||||
|
query: torch.Tensor,
|
||||||
|
key: torch.Tensor,
|
||||||
|
cos: torch.Tensor,
|
||||||
|
sin: torch.Tensor
|
||||||
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
"""Apply rotary position embeddings"""
|
||||||
|
# Rotate half trick for efficiency
|
||||||
|
def rotate_half(x):
|
||||||
|
x1, x2 = x.chunk(2, dim=-1)
|
||||||
|
return torch.cat([-x2, x1], dim=-1)
|
||||||
|
|
||||||
|
query_rot = (query * cos) + (rotate_half(query) * sin)
|
||||||
|
key_rot = (key * cos) + (rotate_half(key) * sin)
|
||||||
|
|
||||||
|
return query_rot, key_rot
|
||||||
|
|
||||||
|
|
||||||
|
def create_causal_mask(seq_len: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Create causal attention mask for autoregressive generation
|
||||||
|
|
||||||
|
Args:
|
||||||
|
seq_len: Sequence length
|
||||||
|
device: Device to create tensor on
|
||||||
|
dtype: Data type
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Causal mask [1, 1, seq_len, seq_len]
|
||||||
|
"""
|
||||||
|
mask = torch.triu(torch.ones(seq_len, seq_len, device=device, dtype=dtype), diagonal=1)
|
||||||
|
mask = mask.masked_fill(mask == 1, float('-inf'))
|
||||||
|
return mask.unsqueeze(0).unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
def create_attention_mask_from_padding(
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
pad_token_id: int
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Create attention mask from padding tokens
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_ids: [batch, seq_len]
|
||||||
|
pad_token_id: ID of padding token
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Attention mask [batch, 1, 1, seq_len]
|
||||||
|
"""
|
||||||
|
# Create padding mask [batch, seq_len]
|
||||||
|
padding_mask = (input_ids != pad_token_id).float()
|
||||||
|
|
||||||
|
# Expand to attention mask format
|
||||||
|
attention_mask = padding_mask.unsqueeze(1).unsqueeze(2) # [batch, 1, 1, seq_len]
|
||||||
|
|
||||||
|
# Convert to additive mask (0 for attend, -inf for ignore)
|
||||||
|
attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
|
||||||
|
|
||||||
|
return attention_mask
|
94
nova_core/config.py
Normal file
94
nova_core/config.py
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
"""
|
||||||
|
Model configuration for NOVA transformer
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ModelConfig:
|
||||||
|
"""Configuration for NOVA transformer model"""
|
||||||
|
|
||||||
|
# Model architecture
|
||||||
|
vocab_size: int = 32000
|
||||||
|
hidden_size: int = 768
|
||||||
|
num_hidden_layers: int = 12
|
||||||
|
num_attention_heads: int = 12
|
||||||
|
intermediate_size: int = 3072
|
||||||
|
max_position_embeddings: int = 2048
|
||||||
|
|
||||||
|
# Activation and normalization
|
||||||
|
hidden_act: str = "swiglu" # or "gelu"
|
||||||
|
norm_type: str = "rmsnorm" # or "layernorm"
|
||||||
|
rms_norm_eps: float = 1e-6
|
||||||
|
|
||||||
|
# Positional encoding
|
||||||
|
rope_theta: float = 10000.0
|
||||||
|
use_rope: bool = True
|
||||||
|
use_alibi: bool = False # Alternative to RoPE
|
||||||
|
|
||||||
|
# Attention
|
||||||
|
attention_dropout: float = 0.0
|
||||||
|
hidden_dropout: float = 0.1
|
||||||
|
num_key_value_heads: Optional[int] = None # For grouped-query attention (GQA)
|
||||||
|
use_flash_attention: bool = False # Auto-detected at runtime
|
||||||
|
|
||||||
|
# Training
|
||||||
|
initializer_range: float = 0.02
|
||||||
|
use_cache: bool = True # KV-cache for inference
|
||||||
|
|
||||||
|
# Efficiency
|
||||||
|
gradient_checkpointing: bool = False
|
||||||
|
tie_word_embeddings: bool = False
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
"""Validate and set derived values"""
|
||||||
|
if self.num_key_value_heads is None:
|
||||||
|
self.num_key_value_heads = self.num_attention_heads
|
||||||
|
|
||||||
|
assert self.hidden_size % self.num_attention_heads == 0, \
|
||||||
|
f"hidden_size ({self.hidden_size}) must be divisible by num_attention_heads ({self.num_attention_heads})"
|
||||||
|
|
||||||
|
assert self.num_attention_heads % self.num_key_value_heads == 0, \
|
||||||
|
f"num_attention_heads ({self.num_attention_heads}) must be divisible by num_key_value_heads ({self.num_key_value_heads})"
|
||||||
|
|
||||||
|
|
||||||
|
# Predefined model sizes
|
||||||
|
MODEL_125M = ModelConfig(
|
||||||
|
vocab_size=32000,
|
||||||
|
hidden_size=768,
|
||||||
|
num_hidden_layers=12,
|
||||||
|
num_attention_heads=12,
|
||||||
|
intermediate_size=3072,
|
||||||
|
max_position_embeddings=2048,
|
||||||
|
)
|
||||||
|
|
||||||
|
MODEL_350M = ModelConfig(
|
||||||
|
vocab_size=32000,
|
||||||
|
hidden_size=1024,
|
||||||
|
num_hidden_layers=24,
|
||||||
|
num_attention_heads=16,
|
||||||
|
intermediate_size=4096,
|
||||||
|
max_position_embeddings=2048,
|
||||||
|
)
|
||||||
|
|
||||||
|
MODEL_1_3B = ModelConfig(
|
||||||
|
vocab_size=32000,
|
||||||
|
hidden_size=2048,
|
||||||
|
num_hidden_layers=24,
|
||||||
|
num_attention_heads=32,
|
||||||
|
intermediate_size=8192,
|
||||||
|
max_position_embeddings=2048,
|
||||||
|
num_key_value_heads=8, # GQA for efficiency
|
||||||
|
)
|
||||||
|
|
||||||
|
MODEL_3B = ModelConfig(
|
||||||
|
vocab_size=32000,
|
||||||
|
hidden_size=2560,
|
||||||
|
num_hidden_layers=32,
|
||||||
|
num_attention_heads=32,
|
||||||
|
intermediate_size=10240,
|
||||||
|
max_position_embeddings=4096,
|
||||||
|
num_key_value_heads=8, # GQA for efficiency
|
||||||
|
)
|
98
nova_core/layers.py
Normal file
98
nova_core/layers.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
"""
|
||||||
|
Transformer block layers
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from .attention import MultiHeadAttention
|
||||||
|
from .activations import MLP
|
||||||
|
from .normalization import get_norm_layer
|
||||||
|
|
||||||
|
|
||||||
|
class TransformerBlock(nn.Module):
|
||||||
|
"""
|
||||||
|
Single transformer decoder block with:
|
||||||
|
- Multi-head attention with RoPE
|
||||||
|
- Feed-forward network (MLP)
|
||||||
|
- Pre-normalization (norm before attention/FFN)
|
||||||
|
- Residual connections
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config, layer_idx: int):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
config: ModelConfig instance
|
||||||
|
layer_idx: Layer index for identification
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.layer_idx = layer_idx
|
||||||
|
|
||||||
|
# Attention
|
||||||
|
self.self_attn = MultiHeadAttention(config)
|
||||||
|
self.attn_norm = get_norm_layer(
|
||||||
|
config.norm_type,
|
||||||
|
config.hidden_size,
|
||||||
|
config.rms_norm_eps
|
||||||
|
)
|
||||||
|
|
||||||
|
# Feed-forward
|
||||||
|
self.mlp = MLP(
|
||||||
|
hidden_size=config.hidden_size,
|
||||||
|
intermediate_size=config.intermediate_size,
|
||||||
|
hidden_act=config.hidden_act
|
||||||
|
)
|
||||||
|
self.mlp_norm = get_norm_layer(
|
||||||
|
config.norm_type,
|
||||||
|
config.hidden_size,
|
||||||
|
config.rms_norm_eps
|
||||||
|
)
|
||||||
|
|
||||||
|
# Dropout
|
||||||
|
self.dropout = nn.Dropout(config.hidden_dropout)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
|
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
|
past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||||
|
use_cache: bool = False,
|
||||||
|
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
hidden_states: [batch, seq_len, hidden_size]
|
||||||
|
attention_mask: Optional attention mask
|
||||||
|
position_embeddings: Optional (cos, sin) for RoPE
|
||||||
|
past_key_value: Optional cached key/value
|
||||||
|
use_cache: Whether to return key/value cache
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(hidden_states, past_key_value if use_cache else None)
|
||||||
|
"""
|
||||||
|
residual = hidden_states
|
||||||
|
|
||||||
|
# Pre-norm for attention
|
||||||
|
hidden_states = self.attn_norm(hidden_states)
|
||||||
|
|
||||||
|
# Self-attention with KV-cache
|
||||||
|
attn_output, past_key_value = self.self_attn(
|
||||||
|
hidden_states=hidden_states,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
position_embeddings=position_embeddings,
|
||||||
|
past_key_value=past_key_value,
|
||||||
|
use_cache=use_cache,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Residual connection
|
||||||
|
hidden_states = residual + self.dropout(attn_output)
|
||||||
|
|
||||||
|
# Feed-forward with pre-norm
|
||||||
|
residual = hidden_states
|
||||||
|
hidden_states = self.mlp_norm(hidden_states)
|
||||||
|
mlp_output = self.mlp(hidden_states)
|
||||||
|
hidden_states = residual + self.dropout(mlp_output)
|
||||||
|
|
||||||
|
return hidden_states, past_key_value
|
335
nova_core/model.py
Normal file
335
nova_core/model.py
Normal file
@@ -0,0 +1,335 @@
|
|||||||
|
"""
|
||||||
|
NOVA Transformer - Main model implementation
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Optional, Tuple, List
|
||||||
|
import math
|
||||||
|
|
||||||
|
from .config import ModelConfig
|
||||||
|
from .layers import TransformerBlock
|
||||||
|
from .rope import RotaryPositionalEmbedding, ALiBiPositionalBias
|
||||||
|
from .normalization import get_norm_layer
|
||||||
|
from .attention import create_causal_mask
|
||||||
|
|
||||||
|
|
||||||
|
class NovaTransformer(nn.Module):
|
||||||
|
"""
|
||||||
|
NOVA Transformer Language Model
|
||||||
|
|
||||||
|
A decoder-only transformer with:
|
||||||
|
- RoPE or ALiBi positional encoding
|
||||||
|
- RMSNorm or LayerNorm
|
||||||
|
- SwiGLU or GELU activations
|
||||||
|
- Grouped-query attention (optional)
|
||||||
|
- KV-cache for fast inference
|
||||||
|
- Gradient checkpointing support
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, config: ModelConfig):
|
||||||
|
super().__init__()
|
||||||
|
self.config = config
|
||||||
|
self.vocab_size = config.vocab_size
|
||||||
|
self.hidden_size = config.hidden_size
|
||||||
|
|
||||||
|
# Token embeddings
|
||||||
|
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
|
||||||
|
|
||||||
|
# Positional encoding
|
||||||
|
if config.use_rope:
|
||||||
|
self.rope = RotaryPositionalEmbedding(
|
||||||
|
dim=config.hidden_size // config.num_attention_heads,
|
||||||
|
max_seq_len=config.max_position_embeddings,
|
||||||
|
theta=config.rope_theta
|
||||||
|
)
|
||||||
|
elif config.use_alibi:
|
||||||
|
self.alibi = ALiBiPositionalBias(
|
||||||
|
num_heads=config.num_attention_heads,
|
||||||
|
max_seq_len=config.max_position_embeddings
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.rope = None
|
||||||
|
self.alibi = None
|
||||||
|
|
||||||
|
# Transformer blocks
|
||||||
|
self.layers = nn.ModuleList([
|
||||||
|
TransformerBlock(config, layer_idx=i)
|
||||||
|
for i in range(config.num_hidden_layers)
|
||||||
|
])
|
||||||
|
|
||||||
|
# Final layer norm
|
||||||
|
self.norm = get_norm_layer(
|
||||||
|
config.norm_type,
|
||||||
|
config.hidden_size,
|
||||||
|
config.rms_norm_eps
|
||||||
|
)
|
||||||
|
|
||||||
|
# Language model head
|
||||||
|
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
||||||
|
|
||||||
|
# Tie weights if specified
|
||||||
|
if config.tie_word_embeddings:
|
||||||
|
self.lm_head.weight = self.embed_tokens.weight
|
||||||
|
|
||||||
|
# Gradient checkpointing
|
||||||
|
self.gradient_checkpointing = config.gradient_checkpointing
|
||||||
|
|
||||||
|
# Initialize weights
|
||||||
|
self.apply(self._init_weights)
|
||||||
|
|
||||||
|
def _init_weights(self, module):
|
||||||
|
"""Initialize weights using normal distribution"""
|
||||||
|
if isinstance(module, nn.Linear):
|
||||||
|
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||||
|
if module.bias is not None:
|
||||||
|
module.bias.data.zero_()
|
||||||
|
elif isinstance(module, nn.Embedding):
|
||||||
|
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
||||||
|
|
||||||
|
def get_input_embeddings(self):
|
||||||
|
return self.embed_tokens
|
||||||
|
|
||||||
|
def set_input_embeddings(self, value):
|
||||||
|
self.embed_tokens = value
|
||||||
|
|
||||||
|
def _prepare_decoder_attention_mask(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
past_key_values_length: int = 0
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Create causal attention mask for decoder
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_ids: [batch, seq_len]
|
||||||
|
past_key_values_length: Length of cached keys/values
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Causal attention mask
|
||||||
|
"""
|
||||||
|
batch_size, seq_len = input_ids.shape
|
||||||
|
device = input_ids.device
|
||||||
|
dtype = torch.float32
|
||||||
|
|
||||||
|
# Create causal mask
|
||||||
|
if past_key_values_length > 0:
|
||||||
|
# During generation, only mask the new token
|
||||||
|
mask = torch.zeros(
|
||||||
|
(batch_size, 1, seq_len, past_key_values_length + seq_len),
|
||||||
|
device=device,
|
||||||
|
dtype=dtype
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# During training, mask future tokens
|
||||||
|
mask = create_causal_mask(seq_len, device, dtype)
|
||||||
|
|
||||||
|
return mask
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
attention_mask: Optional[torch.Tensor] = None,
|
||||||
|
past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
|
||||||
|
use_cache: bool = False,
|
||||||
|
return_dict: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Forward pass through NOVA transformer
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_ids: [batch, seq_len]
|
||||||
|
attention_mask: Optional custom attention mask
|
||||||
|
past_key_values: Optional cached key/values for generation
|
||||||
|
use_cache: Whether to return key/value cache
|
||||||
|
return_dict: Whether to return dict or tuple
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ModelOutput with logits and optional cache
|
||||||
|
"""
|
||||||
|
batch_size, seq_len = input_ids.shape
|
||||||
|
|
||||||
|
# Get past sequence length for KV-cache
|
||||||
|
past_key_values_length = 0
|
||||||
|
if past_key_values is not None:
|
||||||
|
past_key_values_length = past_key_values[0][0].shape[2]
|
||||||
|
|
||||||
|
# Embed tokens
|
||||||
|
hidden_states = self.embed_tokens(input_ids)
|
||||||
|
|
||||||
|
# Prepare attention mask
|
||||||
|
if attention_mask is None:
|
||||||
|
attention_mask = self._prepare_decoder_attention_mask(
|
||||||
|
input_ids,
|
||||||
|
past_key_values_length
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare position embeddings for RoPE
|
||||||
|
position_embeddings = None
|
||||||
|
if self.rope is not None:
|
||||||
|
# Create position IDs
|
||||||
|
position_ids = torch.arange(
|
||||||
|
past_key_values_length,
|
||||||
|
seq_len + past_key_values_length,
|
||||||
|
dtype=torch.long,
|
||||||
|
device=input_ids.device
|
||||||
|
)
|
||||||
|
position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
|
||||||
|
|
||||||
|
# Get cos/sin embeddings
|
||||||
|
cos = self.rope.cos_cached[position_ids].unsqueeze(1)
|
||||||
|
sin = self.rope.sin_cached[position_ids].unsqueeze(1)
|
||||||
|
position_embeddings = (cos, sin)
|
||||||
|
|
||||||
|
# Pass through transformer blocks
|
||||||
|
next_cache = [] if use_cache else None
|
||||||
|
|
||||||
|
for idx, layer in enumerate(self.layers):
|
||||||
|
past_key_value = past_key_values[idx] if past_key_values is not None else None
|
||||||
|
|
||||||
|
if self.gradient_checkpointing and self.training:
|
||||||
|
# Use gradient checkpointing during training
|
||||||
|
def create_custom_forward(module):
|
||||||
|
def custom_forward(*inputs):
|
||||||
|
return module(*inputs)
|
||||||
|
return custom_forward
|
||||||
|
|
||||||
|
layer_outputs = torch.utils.checkpoint.checkpoint(
|
||||||
|
create_custom_forward(layer),
|
||||||
|
hidden_states,
|
||||||
|
attention_mask,
|
||||||
|
position_embeddings,
|
||||||
|
past_key_value,
|
||||||
|
use_cache,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
layer_outputs = layer(
|
||||||
|
hidden_states,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
position_embeddings=position_embeddings,
|
||||||
|
past_key_value=past_key_value,
|
||||||
|
use_cache=use_cache,
|
||||||
|
)
|
||||||
|
|
||||||
|
hidden_states = layer_outputs[0]
|
||||||
|
|
||||||
|
if use_cache:
|
||||||
|
next_cache.append(layer_outputs[1])
|
||||||
|
|
||||||
|
# Final layer norm
|
||||||
|
hidden_states = self.norm(hidden_states)
|
||||||
|
|
||||||
|
# LM head
|
||||||
|
logits = self.lm_head(hidden_states)
|
||||||
|
|
||||||
|
if return_dict:
|
||||||
|
return {
|
||||||
|
'logits': logits,
|
||||||
|
'past_key_values': next_cache if use_cache else None,
|
||||||
|
'hidden_states': hidden_states,
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
return (logits, next_cache if use_cache else None)
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def generate(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
max_new_tokens: int = 100,
|
||||||
|
temperature: float = 1.0,
|
||||||
|
top_k: Optional[int] = None,
|
||||||
|
top_p: Optional[float] = None,
|
||||||
|
repetition_penalty: float = 1.0,
|
||||||
|
do_sample: bool = True,
|
||||||
|
eos_token_id: Optional[int] = None,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Generate text using the model
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_ids: [batch, seq_len] starting tokens
|
||||||
|
max_new_tokens: Maximum tokens to generate
|
||||||
|
temperature: Sampling temperature (higher = more random)
|
||||||
|
top_k: Keep only top k tokens for sampling
|
||||||
|
top_p: Nucleus sampling - keep top tokens with cumulative probability p
|
||||||
|
repetition_penalty: Penalty for repeating tokens (>1.0 discourages)
|
||||||
|
do_sample: Whether to sample (True) or use greedy decoding (False)
|
||||||
|
eos_token_id: Token ID that ends generation
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Generated token IDs [batch, seq_len + new_tokens]
|
||||||
|
"""
|
||||||
|
self.eval()
|
||||||
|
device = input_ids.device
|
||||||
|
past_key_values = None
|
||||||
|
|
||||||
|
for _ in range(max_new_tokens):
|
||||||
|
# Forward pass with cache
|
||||||
|
outputs = self.forward(
|
||||||
|
input_ids=input_ids if past_key_values is None else input_ids[:, -1:],
|
||||||
|
past_key_values=past_key_values,
|
||||||
|
use_cache=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
logits = outputs['logits'][:, -1, :] # [batch, vocab_size]
|
||||||
|
past_key_values = outputs['past_key_values']
|
||||||
|
|
||||||
|
# Apply repetition penalty
|
||||||
|
if repetition_penalty != 1.0:
|
||||||
|
for token_id in set(input_ids[0].tolist()):
|
||||||
|
logits[0, token_id] /= repetition_penalty
|
||||||
|
|
||||||
|
# Apply temperature
|
||||||
|
if temperature != 1.0:
|
||||||
|
logits = logits / temperature
|
||||||
|
|
||||||
|
# Top-k filtering
|
||||||
|
if top_k is not None:
|
||||||
|
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
|
||||||
|
logits[indices_to_remove] = float('-inf')
|
||||||
|
|
||||||
|
# Top-p (nucleus) filtering
|
||||||
|
if top_p is not None:
|
||||||
|
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
|
||||||
|
cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
|
||||||
|
|
||||||
|
# Remove tokens with cumulative probability above threshold
|
||||||
|
sorted_indices_to_remove = cumulative_probs > top_p
|
||||||
|
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
|
||||||
|
sorted_indices_to_remove[..., 0] = 0
|
||||||
|
|
||||||
|
indices_to_remove = sorted_indices_to_remove.scatter(
|
||||||
|
1, sorted_indices, sorted_indices_to_remove
|
||||||
|
)
|
||||||
|
logits[indices_to_remove] = float('-inf')
|
||||||
|
|
||||||
|
# Sample or greedy decode
|
||||||
|
if do_sample:
|
||||||
|
probs = torch.softmax(logits, dim=-1)
|
||||||
|
next_token = torch.multinomial(probs, num_samples=1)
|
||||||
|
else:
|
||||||
|
next_token = torch.argmax(logits, dim=-1, keepdim=True)
|
||||||
|
|
||||||
|
# Append to sequence
|
||||||
|
input_ids = torch.cat([input_ids, next_token], dim=-1)
|
||||||
|
|
||||||
|
# Check for EOS
|
||||||
|
if eos_token_id is not None and next_token.item() == eos_token_id:
|
||||||
|
break
|
||||||
|
|
||||||
|
return input_ids
|
||||||
|
|
||||||
|
def get_num_params(self, non_embedding: bool = False) -> int:
|
||||||
|
"""
|
||||||
|
Get number of parameters in the model
|
||||||
|
|
||||||
|
Args:
|
||||||
|
non_embedding: If True, exclude embedding parameters
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of parameters
|
||||||
|
"""
|
||||||
|
n_params = sum(p.numel() for p in self.parameters())
|
||||||
|
if non_embedding:
|
||||||
|
n_params -= self.embed_tokens.weight.numel()
|
||||||
|
return n_params
|
74
nova_core/normalization.py
Normal file
74
nova_core/normalization.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
"""
|
||||||
|
Normalization layers for NOVA
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
class RMSNorm(nn.Module):
|
||||||
|
"""
|
||||||
|
Root Mean Square Layer Normalization
|
||||||
|
More efficient than LayerNorm, used in LLaMA and other modern LLMs
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, hidden_size: int, eps: float = 1e-6):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
hidden_size: Size of the hidden dimension
|
||||||
|
eps: Small constant for numerical stability
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.weight = nn.Parameter(torch.ones(hidden_size))
|
||||||
|
self.eps = eps
|
||||||
|
|
||||||
|
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Apply RMS normalization
|
||||||
|
|
||||||
|
Args:
|
||||||
|
hidden_states: Input tensor [..., hidden_size]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Normalized tensor
|
||||||
|
"""
|
||||||
|
input_dtype = hidden_states.dtype
|
||||||
|
hidden_states = hidden_states.to(torch.float32)
|
||||||
|
|
||||||
|
# Compute RMS
|
||||||
|
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
||||||
|
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
|
||||||
|
|
||||||
|
return self.weight * hidden_states.to(input_dtype)
|
||||||
|
|
||||||
|
|
||||||
|
class LayerNorm(nn.LayerNorm):
|
||||||
|
"""
|
||||||
|
Standard LayerNorm with optional bias
|
||||||
|
Wrapper around PyTorch's LayerNorm for consistency
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, hidden_size: int, eps: float = 1e-6, bias: bool = True):
|
||||||
|
super().__init__(hidden_size, eps=eps, elementwise_affine=True)
|
||||||
|
if not bias:
|
||||||
|
self.bias = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_norm_layer(norm_type: str, hidden_size: int, eps: float = 1e-6) -> nn.Module:
|
||||||
|
"""
|
||||||
|
Factory function to get normalization layer
|
||||||
|
|
||||||
|
Args:
|
||||||
|
norm_type: Type of normalization ('rmsnorm' or 'layernorm')
|
||||||
|
hidden_size: Size of hidden dimension
|
||||||
|
eps: Epsilon for numerical stability
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Normalization layer
|
||||||
|
"""
|
||||||
|
if norm_type.lower() == "rmsnorm":
|
||||||
|
return RMSNorm(hidden_size, eps)
|
||||||
|
elif norm_type.lower() == "layernorm":
|
||||||
|
return LayerNorm(hidden_size, eps)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown norm_type: {norm_type}. Use 'rmsnorm' or 'layernorm'")
|
155
nova_core/rope.py
Normal file
155
nova_core/rope.py
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
"""
|
||||||
|
Rotary Position Embedding (RoPE) implementation
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
|
||||||
|
class RotaryPositionalEmbedding(nn.Module):
|
||||||
|
"""
|
||||||
|
Rotary Position Embedding (RoPE) from Su et al. (2021)
|
||||||
|
https://arxiv.org/abs/2104.09864
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, dim: int, max_seq_len: int = 2048, theta: float = 10000.0):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
dim: Dimension of the embeddings (should be head_dim)
|
||||||
|
max_seq_len: Maximum sequence length
|
||||||
|
theta: Base for the geometric progression (default 10000.0)
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.dim = dim
|
||||||
|
self.max_seq_len = max_seq_len
|
||||||
|
self.theta = theta
|
||||||
|
|
||||||
|
# Precompute frequencies
|
||||||
|
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
|
||||||
|
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
||||||
|
|
||||||
|
# Precompute cos/sin cache
|
||||||
|
self._update_cos_sin_cache(max_seq_len)
|
||||||
|
|
||||||
|
def _update_cos_sin_cache(self, seq_len: int):
|
||||||
|
"""Precompute cos and sin for positions up to seq_len"""
|
||||||
|
position = torch.arange(seq_len).unsqueeze(1)
|
||||||
|
freqs = position * self.inv_freq.unsqueeze(0)
|
||||||
|
|
||||||
|
# Create rotation matrix [seq_len, dim/2]
|
||||||
|
emb = torch.cat([freqs, freqs], dim=-1)
|
||||||
|
|
||||||
|
self.register_buffer("cos_cached", emb.cos(), persistent=False)
|
||||||
|
self.register_buffer("sin_cached", emb.sin(), persistent=False)
|
||||||
|
self.cached_seq_len = seq_len
|
||||||
|
|
||||||
|
def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""Rotates half the hidden dims of the input"""
|
||||||
|
x1, x2 = x.chunk(2, dim=-1)
|
||||||
|
return torch.cat([-x2, x1], dim=-1)
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
q: torch.Tensor,
|
||||||
|
k: torch.Tensor,
|
||||||
|
position_ids: torch.Tensor = None
|
||||||
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
"""
|
||||||
|
Apply rotary position embeddings to query and key tensors
|
||||||
|
|
||||||
|
Args:
|
||||||
|
q: Query tensor [batch, num_heads, seq_len, head_dim]
|
||||||
|
k: Key tensor [batch, num_heads, seq_len, head_dim]
|
||||||
|
position_ids: Optional position IDs [batch, seq_len]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of rotated query and key tensors
|
||||||
|
"""
|
||||||
|
seq_len = q.shape[2]
|
||||||
|
|
||||||
|
# Update cache if needed
|
||||||
|
if seq_len > self.cached_seq_len:
|
||||||
|
self._update_cos_sin_cache(seq_len)
|
||||||
|
|
||||||
|
# Get cos/sin for current positions
|
||||||
|
if position_ids is not None:
|
||||||
|
# For generation with KV-cache
|
||||||
|
cos = self.cos_cached[position_ids].unsqueeze(1)
|
||||||
|
sin = self.sin_cached[position_ids].unsqueeze(1)
|
||||||
|
else:
|
||||||
|
# For training or initial forward pass
|
||||||
|
cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0)
|
||||||
|
sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0)
|
||||||
|
|
||||||
|
# Apply rotation
|
||||||
|
q_embed = (q * cos) + (self.rotate_half(q) * sin)
|
||||||
|
k_embed = (k * cos) + (self.rotate_half(k) * sin)
|
||||||
|
|
||||||
|
return q_embed, k_embed
|
||||||
|
|
||||||
|
|
||||||
|
class ALiBiPositionalBias(nn.Module):
|
||||||
|
"""
|
||||||
|
Attention with Linear Biases (ALiBi) from Press et al. (2021)
|
||||||
|
https://arxiv.org/abs/2108.12409
|
||||||
|
Alternative to RoPE
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, num_heads: int, max_seq_len: int = 2048):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
num_heads: Number of attention heads
|
||||||
|
max_seq_len: Maximum sequence length
|
||||||
|
"""
|
||||||
|
super().__init__()
|
||||||
|
self.num_heads = num_heads
|
||||||
|
self.max_seq_len = max_seq_len
|
||||||
|
|
||||||
|
# Compute slopes for each head
|
||||||
|
slopes = self._get_slopes(num_heads)
|
||||||
|
self.register_buffer("slopes", slopes, persistent=False)
|
||||||
|
|
||||||
|
# Precompute bias matrix
|
||||||
|
alibi = self._get_alibi_bias(max_seq_len, slopes)
|
||||||
|
self.register_buffer("alibi_bias", alibi, persistent=False)
|
||||||
|
|
||||||
|
def _get_slopes(self, num_heads: int) -> torch.Tensor:
|
||||||
|
"""Compute slopes for ALiBi"""
|
||||||
|
def get_slopes_power_of_2(n):
|
||||||
|
start = 2 ** (-(2 ** -(torch.log2(torch.tensor(n)) - 3)))
|
||||||
|
ratio = start
|
||||||
|
return torch.pow(2, torch.arange(n)) * ratio
|
||||||
|
|
||||||
|
# Handle non-power-of-2 number of heads
|
||||||
|
if (num_heads & (num_heads - 1)) == 0:
|
||||||
|
return get_slopes_power_of_2(num_heads)
|
||||||
|
else:
|
||||||
|
closest_power_of_2 = 2 ** torch.floor(torch.log2(torch.tensor(num_heads)))
|
||||||
|
slopes_a = get_slopes_power_of_2(int(closest_power_of_2))
|
||||||
|
slopes_b = self._get_slopes(int(2 * closest_power_of_2))[0::2][:num_heads - int(closest_power_of_2)]
|
||||||
|
return torch.cat([slopes_a, slopes_b])
|
||||||
|
|
||||||
|
def _get_alibi_bias(self, seq_len: int, slopes: torch.Tensor) -> torch.Tensor:
|
||||||
|
"""Precompute ALiBi bias matrix"""
|
||||||
|
# Create relative position matrix
|
||||||
|
pos = torch.arange(seq_len).unsqueeze(0)
|
||||||
|
rel_pos = pos - pos.T # [seq_len, seq_len]
|
||||||
|
|
||||||
|
# Apply slopes [num_heads, seq_len, seq_len]
|
||||||
|
alibi = rel_pos.unsqueeze(0) * slopes.unsqueeze(-1).unsqueeze(-1)
|
||||||
|
|
||||||
|
return alibi
|
||||||
|
|
||||||
|
def forward(self, attention_scores: torch.Tensor, seq_len: int) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Add ALiBi bias to attention scores
|
||||||
|
|
||||||
|
Args:
|
||||||
|
attention_scores: [batch, num_heads, seq_len, seq_len]
|
||||||
|
seq_len: Current sequence length
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Biased attention scores
|
||||||
|
"""
|
||||||
|
return attention_scores + self.alibi_bias[:, :seq_len, :seq_len]
|
13
nova_data/__init__.py
Normal file
13
nova_data/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
"""
|
||||||
|
NOVA Data - Legal dataset acquisition and processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .pipeline import DataPipeline
|
||||||
|
from .legal_sources import LegalDatasetRegistry
|
||||||
|
from .preprocessing import TextPreprocessor
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'DataPipeline',
|
||||||
|
'LegalDatasetRegistry',
|
||||||
|
'TextPreprocessor',
|
||||||
|
]
|
109
nova_data/legal_sources.py
Normal file
109
nova_data/legal_sources.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
"""
|
||||||
|
Legal dataset sources and license tracking
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Optional
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class License(Enum):
|
||||||
|
"""Supported open licenses"""
|
||||||
|
PUBLIC_DOMAIN = "public-domain"
|
||||||
|
CC0 = "cc0-1.0"
|
||||||
|
CC_BY = "cc-by-4.0"
|
||||||
|
MIT = "mit"
|
||||||
|
APACHE_2 = "apache-2.0"
|
||||||
|
BSD = "bsd-3-clause"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class DatasetSource:
|
||||||
|
"""Definition of a legal dataset source"""
|
||||||
|
name: str
|
||||||
|
description: str
|
||||||
|
license: License
|
||||||
|
url: str
|
||||||
|
download_function: str # Name of function to download
|
||||||
|
estimated_size_gb: float
|
||||||
|
language: str = "en"
|
||||||
|
|
||||||
|
|
||||||
|
class LegalDatasetRegistry:
|
||||||
|
"""
|
||||||
|
Registry of legal, properly licensed datasets for NOVA
|
||||||
|
|
||||||
|
IMPORTANT: Only includes datasets with permissive licenses
|
||||||
|
suitable for training language models
|
||||||
|
"""
|
||||||
|
|
||||||
|
SOURCES = [
|
||||||
|
DatasetSource(
|
||||||
|
name="wikipedia-en",
|
||||||
|
description="English Wikipedia dump (latest)",
|
||||||
|
license=License.CC_BY,
|
||||||
|
url="https://dumps.wikimedia.org/enwiki/latest/",
|
||||||
|
download_function="download_wikipedia",
|
||||||
|
estimated_size_gb=20.0,
|
||||||
|
language="en"
|
||||||
|
),
|
||||||
|
DatasetSource(
|
||||||
|
name="project-gutenberg",
|
||||||
|
description="Project Gutenberg public domain books",
|
||||||
|
license=License.PUBLIC_DOMAIN,
|
||||||
|
url="https://www.gutenberg.org/",
|
||||||
|
download_function="download_gutenberg",
|
||||||
|
estimated_size_gb=15.0,
|
||||||
|
language="en"
|
||||||
|
),
|
||||||
|
DatasetSource(
|
||||||
|
name="openwebtext",
|
||||||
|
description="Open reproduction of WebText (Reddit links)",
|
||||||
|
license=License.CC0,
|
||||||
|
url="https://huggingface.co/datasets/Skylion007/openwebtext",
|
||||||
|
download_function="download_openwebtext",
|
||||||
|
estimated_size_gb=38.0,
|
||||||
|
language="en"
|
||||||
|
),
|
||||||
|
DatasetSource(
|
||||||
|
name="c4",
|
||||||
|
description="Colossal Clean Crawled Corpus (C4)",
|
||||||
|
license=License.CC_BY,
|
||||||
|
url="https://huggingface.co/datasets/c4",
|
||||||
|
download_function="download_c4",
|
||||||
|
estimated_size_gb=300.0,
|
||||||
|
language="en"
|
||||||
|
),
|
||||||
|
DatasetSource(
|
||||||
|
name="the-pile-arxiv",
|
||||||
|
description="ArXiv papers from The Pile",
|
||||||
|
license=License.MIT,
|
||||||
|
url="https://pile.eleuther.ai/",
|
||||||
|
download_function="download_pile_arxiv",
|
||||||
|
estimated_size_gb=60.0,
|
||||||
|
language="en"
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def list_sources(cls) -> List[DatasetSource]:
|
||||||
|
"""List all available legal sources"""
|
||||||
|
return cls.SOURCES
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_source(cls, name: str) -> Optional[DatasetSource]:
|
||||||
|
"""Get source by name"""
|
||||||
|
for source in cls.SOURCES:
|
||||||
|
if source.name == name:
|
||||||
|
return source
|
||||||
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def filter_by_license(cls, license: License) -> List[DatasetSource]:
|
||||||
|
"""Filter sources by license"""
|
||||||
|
return [s for s in cls.SOURCES if s.license == license]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def filter_by_size(cls, max_size_gb: float) -> List[DatasetSource]:
|
||||||
|
"""Filter sources by size"""
|
||||||
|
return [s for s in cls.SOURCES if s.estimated_size_gb <= max_size_gb]
|
168
nova_data/pipeline.py
Normal file
168
nova_data/pipeline.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
"""
|
||||||
|
Data pipeline for legal dataset acquisition and processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
from tqdm import tqdm
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from .legal_sources import LegalDatasetRegistry, DatasetSource
|
||||||
|
|
||||||
|
|
||||||
|
class DataPipeline:
|
||||||
|
"""
|
||||||
|
Legal-only data acquisition and processing pipeline
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- License tracking and verification
|
||||||
|
- Provenance recording
|
||||||
|
- Deduplication
|
||||||
|
- Text cleaning
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, output_dir: str = "data/processed"):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
output_dir: Directory for processed data
|
||||||
|
"""
|
||||||
|
self.output_dir = Path(output_dir)
|
||||||
|
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# License ledger
|
||||||
|
self.ledger_path = self.output_dir / "license_ledger.json"
|
||||||
|
self.ledger = self._load_ledger()
|
||||||
|
|
||||||
|
def _load_ledger(self) -> Dict:
|
||||||
|
"""Load license ledger"""
|
||||||
|
if self.ledger_path.exists():
|
||||||
|
with open(self.ledger_path, 'r') as f:
|
||||||
|
return json.load(f)
|
||||||
|
return {'sources': [], 'shards': []}
|
||||||
|
|
||||||
|
def _save_ledger(self):
|
||||||
|
"""Save license ledger"""
|
||||||
|
with open(self.ledger_path, 'w') as f:
|
||||||
|
json.dump(self.ledger, f, indent=2)
|
||||||
|
|
||||||
|
def download_source(self, source_name: str, dry_run: bool = False):
|
||||||
|
"""
|
||||||
|
Download a legal dataset source
|
||||||
|
|
||||||
|
Args:
|
||||||
|
source_name: Name of source from registry
|
||||||
|
dry_run: If True, don't actually download (just show info)
|
||||||
|
"""
|
||||||
|
source = LegalDatasetRegistry.get_source(source_name)
|
||||||
|
|
||||||
|
if not source:
|
||||||
|
raise ValueError(f"Unknown source: {source_name}")
|
||||||
|
|
||||||
|
print(f"Source: {source.name}")
|
||||||
|
print(f"Description: {source.description}")
|
||||||
|
print(f"License: {source.license.value}")
|
||||||
|
print(f"Estimated size: {source.estimated_size_gb} GB")
|
||||||
|
|
||||||
|
if dry_run:
|
||||||
|
print("\n[DRY RUN] Would download from:", source.url)
|
||||||
|
return
|
||||||
|
|
||||||
|
print("\nDownloading...")
|
||||||
|
# TODO: Implement actual download logic for each source
|
||||||
|
# For now, this is a placeholder
|
||||||
|
|
||||||
|
# Record in ledger
|
||||||
|
self.ledger['sources'].append({
|
||||||
|
'name': source.name,
|
||||||
|
'license': source.license.value,
|
||||||
|
'url': source.url,
|
||||||
|
'download_date': str(Path.ctime(self.output_dir)),
|
||||||
|
})
|
||||||
|
|
||||||
|
self._save_ledger()
|
||||||
|
print("✓ Download complete and recorded in ledger")
|
||||||
|
|
||||||
|
def create_toy_dataset(self):
|
||||||
|
"""
|
||||||
|
Create a tiny toy dataset for offline e2e demo
|
||||||
|
|
||||||
|
This is a minimal legal dataset for testing without downloads
|
||||||
|
"""
|
||||||
|
toy_data_path = Path("data/toy_dataset/toy.txt")
|
||||||
|
toy_data_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Public domain sample texts
|
||||||
|
sample_texts = [
|
||||||
|
"The quick brown fox jumps over the lazy dog.",
|
||||||
|
"To be or not to be, that is the question.",
|
||||||
|
"In the beginning was the Word.",
|
||||||
|
"It was the best of times, it was the worst of times.",
|
||||||
|
"Call me Ishmael.",
|
||||||
|
"All happy families are alike.",
|
||||||
|
"It is a truth universally acknowledged.",
|
||||||
|
"The past is a foreign country; they do things differently there.",
|
||||||
|
"Once upon a time in a land far away.",
|
||||||
|
"The sun rose over the horizon, painting the sky in shades of gold.",
|
||||||
|
] * 100 # Repeat for more data
|
||||||
|
|
||||||
|
with open(toy_data_path, 'w', encoding='utf-8') as f:
|
||||||
|
for text in sample_texts:
|
||||||
|
f.write(text + '\n')
|
||||||
|
|
||||||
|
print(f"✓ Toy dataset created: {toy_data_path}")
|
||||||
|
|
||||||
|
# Record in ledger
|
||||||
|
self.ledger['sources'].append({
|
||||||
|
'name': 'toy-dataset',
|
||||||
|
'license': 'public-domain',
|
||||||
|
'description': 'Minimal toy dataset for testing',
|
||||||
|
'created': 'generated',
|
||||||
|
})
|
||||||
|
|
||||||
|
self._save_ledger()
|
||||||
|
|
||||||
|
return str(toy_data_path)
|
||||||
|
|
||||||
|
def verify_licenses(self) -> bool:
|
||||||
|
"""
|
||||||
|
Verify all data sources have proper licenses
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if all sources are properly licensed
|
||||||
|
"""
|
||||||
|
print("Verifying licenses...")
|
||||||
|
|
||||||
|
all_valid = True
|
||||||
|
|
||||||
|
for source_entry in self.ledger['sources']:
|
||||||
|
name = source_entry.get('name')
|
||||||
|
license_str = source_entry.get('license')
|
||||||
|
|
||||||
|
print(f" {name}: {license_str}")
|
||||||
|
|
||||||
|
# Check if license is in our approved list
|
||||||
|
valid_licenses = [lic.value for lic in LegalDatasetRegistry.License]
|
||||||
|
if license_str not in valid_licenses and license_str != 'public-domain':
|
||||||
|
print(f" ⚠️ WARNING: Unrecognized license!")
|
||||||
|
all_valid = False
|
||||||
|
|
||||||
|
if all_valid:
|
||||||
|
print("\n✓ All sources properly licensed")
|
||||||
|
else:
|
||||||
|
print("\n⚠️ Some sources have unverified licenses")
|
||||||
|
|
||||||
|
return all_valid
|
||||||
|
|
||||||
|
def show_ledger(self):
|
||||||
|
"""Print license ledger"""
|
||||||
|
print("\nLicense Ledger:")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
print(f"\nSources ({len(self.ledger['sources'])}):")
|
||||||
|
for source in self.ledger['sources']:
|
||||||
|
print(f" - {source['name']}: {source['license']}")
|
||||||
|
|
||||||
|
print(f"\nShards ({len(self.ledger['shards'])}):")
|
||||||
|
for shard in self.ledger.get('shards', []):
|
||||||
|
print(f" - {shard['name']}")
|
13
nova_evo/__init__.py
Normal file
13
nova_evo/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
"""
|
||||||
|
NOVA-EVO - Genetic algorithm for architecture and hyperparameter optimization
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .evolution import EvolutionEngine
|
||||||
|
from .fitness import FitnessEvaluator
|
||||||
|
from .config import EvolutionConfig
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'EvolutionEngine',
|
||||||
|
'FitnessEvaluator',
|
||||||
|
'EvolutionConfig',
|
||||||
|
]
|
117
nova_evo/config.py
Normal file
117
nova_evo/config.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
"""
|
||||||
|
Evolution configuration for NOVA-EVO
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class EvolutionConfig:
|
||||||
|
"""Configuration for genetic algorithm evolution"""
|
||||||
|
|
||||||
|
# Population settings
|
||||||
|
population_size: int = 20
|
||||||
|
num_generations: int = 10
|
||||||
|
elite_ratio: float = 0.2 # Top performers to keep
|
||||||
|
mutation_rate: float = 0.3
|
||||||
|
|
||||||
|
# Search space - hyperparameters
|
||||||
|
search_learning_rate: bool = True
|
||||||
|
lr_min: float = 1e-5
|
||||||
|
lr_max: float = 1e-3
|
||||||
|
|
||||||
|
search_batch_size: bool = True
|
||||||
|
batch_size_options: List[int] = field(default_factory=lambda: [4, 8, 16, 32])
|
||||||
|
|
||||||
|
search_warmup_steps: bool = True
|
||||||
|
warmup_min: int = 100
|
||||||
|
warmup_max: int = 2000
|
||||||
|
|
||||||
|
search_weight_decay: bool = True
|
||||||
|
wd_min: float = 0.0
|
||||||
|
wd_max: float = 0.3
|
||||||
|
|
||||||
|
# Search space - architecture toggles
|
||||||
|
search_rope_theta: bool = True
|
||||||
|
rope_theta_options: List[float] = field(default_factory=lambda: [1000.0, 10000.0, 100000.0])
|
||||||
|
|
||||||
|
search_activation: bool = True
|
||||||
|
activation_options: List[str] = field(default_factory=lambda: ['swiglu', 'geglu', 'gelu'])
|
||||||
|
|
||||||
|
search_norm: bool = True
|
||||||
|
norm_options: List[str] = field(default_factory=lambda: ['rmsnorm', 'layernorm'])
|
||||||
|
|
||||||
|
# Fitness evaluation
|
||||||
|
eval_steps: int = 100 # How many steps to train for evaluation
|
||||||
|
eval_dataset_size: int = 1000 # Number of samples for evaluation
|
||||||
|
|
||||||
|
# Multi-objective weights
|
||||||
|
loss_weight: float = 0.5
|
||||||
|
latency_weight: float = 0.2
|
||||||
|
memory_weight: float = 0.2
|
||||||
|
quality_weight: float = 0.1 # Chat quality (if eval set available)
|
||||||
|
|
||||||
|
# Compute budgets
|
||||||
|
max_eval_time_seconds: float = 300.0 # Max time per individual eval
|
||||||
|
max_total_time_hours: float = 24.0 # Max total evolution time
|
||||||
|
|
||||||
|
# Checkpointing
|
||||||
|
save_dir: str = "nova_evo/hall_of_fame"
|
||||||
|
checkpoint_every_n_generations: int = 5
|
||||||
|
|
||||||
|
# Reproducibility
|
||||||
|
seed: int = 42
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Individual:
|
||||||
|
"""Single individual in evolution population"""
|
||||||
|
|
||||||
|
# Hyperparameters
|
||||||
|
learning_rate: float = 3e-4
|
||||||
|
batch_size: int = 8
|
||||||
|
warmup_steps: int = 1000
|
||||||
|
weight_decay: float = 0.1
|
||||||
|
|
||||||
|
# Architecture choices
|
||||||
|
rope_theta: float = 10000.0
|
||||||
|
hidden_act: str = "swiglu"
|
||||||
|
norm_type: str = "rmsnorm"
|
||||||
|
|
||||||
|
# Fitness scores
|
||||||
|
loss: Optional[float] = None
|
||||||
|
perplexity: Optional[float] = None
|
||||||
|
latency_ms: Optional[float] = None
|
||||||
|
memory_mb: Optional[float] = None
|
||||||
|
quality_score: Optional[float] = None
|
||||||
|
fitness: Optional[float] = None
|
||||||
|
|
||||||
|
# Metadata
|
||||||
|
generation: int = 0
|
||||||
|
parent_ids: List[int] = field(default_factory=list)
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
|
||||||
|
"""Convert to dictionary"""
|
||||||
|
return {
|
||||||
|
'learning_rate': self.learning_rate,
|
||||||
|
'batch_size': self.batch_size,
|
||||||
|
'warmup_steps': self.warmup_steps,
|
||||||
|
'weight_decay': self.weight_decay,
|
||||||
|
'rope_theta': self.rope_theta,
|
||||||
|
'hidden_act': self.hidden_act,
|
||||||
|
'norm_type': self.norm_type,
|
||||||
|
'loss': self.loss,
|
||||||
|
'perplexity': self.perplexity,
|
||||||
|
'latency_ms': self.latency_ms,
|
||||||
|
'memory_mb': self.memory_mb,
|
||||||
|
'quality_score': self.quality_score,
|
||||||
|
'fitness': self.fitness,
|
||||||
|
'generation': self.generation,
|
||||||
|
'parent_ids': self.parent_ids,
|
||||||
|
}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, data: Dict[str, Any]) -> 'Individual':
|
||||||
|
"""Create from dictionary"""
|
||||||
|
return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
|
318
nova_evo/evolution.py
Normal file
318
nova_evo/evolution.py
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
"""
|
||||||
|
NOVA-EVO: Genetic algorithm for hyperparameter and architecture search
|
||||||
|
"""
|
||||||
|
|
||||||
|
import random
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple, Optional
|
||||||
|
import time
|
||||||
|
from tqdm import tqdm
|
||||||
|
import copy
|
||||||
|
|
||||||
|
from .config import EvolutionConfig, Individual
|
||||||
|
from .fitness import FitnessEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class EvolutionEngine:
|
||||||
|
"""
|
||||||
|
Genetic algorithm engine for evolving NOVA configurations
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Multi-objective fitness (loss, latency, memory, quality)
|
||||||
|
- Elitism with Pareto selection
|
||||||
|
- Mutation and crossover
|
||||||
|
- Hall of Fame for best individuals
|
||||||
|
- Rollback on regression
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
config: EvolutionConfig,
|
||||||
|
fitness_evaluator: FitnessEvaluator,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
config: Evolution configuration
|
||||||
|
fitness_evaluator: Fitness evaluation engine
|
||||||
|
"""
|
||||||
|
self.config = config
|
||||||
|
self.evaluator = fitness_evaluator
|
||||||
|
|
||||||
|
# Population
|
||||||
|
self.population: List[Individual] = []
|
||||||
|
self.generation = 0
|
||||||
|
|
||||||
|
# Hall of Fame - best individuals
|
||||||
|
self.hall_of_fame: List[Individual] = []
|
||||||
|
self.max_hof_size = 10
|
||||||
|
|
||||||
|
# Tracking
|
||||||
|
self.evolution_history = []
|
||||||
|
self.start_time = None
|
||||||
|
|
||||||
|
# Setup
|
||||||
|
Path(config.save_dir).mkdir(parents=True, exist_ok=True)
|
||||||
|
random.seed(config.seed)
|
||||||
|
|
||||||
|
def initialize_population(self) -> List[Individual]:
|
||||||
|
"""Create initial random population"""
|
||||||
|
print(f"Initializing population of {self.config.population_size}...")
|
||||||
|
|
||||||
|
population = []
|
||||||
|
|
||||||
|
for i in range(self.config.population_size):
|
||||||
|
individual = Individual(
|
||||||
|
learning_rate=random.uniform(self.config.lr_min, self.config.lr_max) if self.config.search_learning_rate else 3e-4,
|
||||||
|
batch_size=random.choice(self.config.batch_size_options) if self.config.search_batch_size else 8,
|
||||||
|
warmup_steps=random.randint(self.config.warmup_min, self.config.warmup_max) if self.config.search_warmup_steps else 1000,
|
||||||
|
weight_decay=random.uniform(self.config.wd_min, self.config.wd_max) if self.config.search_weight_decay else 0.1,
|
||||||
|
rope_theta=random.choice(self.config.rope_theta_options) if self.config.search_rope_theta else 10000.0,
|
||||||
|
hidden_act=random.choice(self.config.activation_options) if self.config.search_activation else "swiglu",
|
||||||
|
norm_type=random.choice(self.config.norm_options) if self.config.search_norm else "rmsnorm",
|
||||||
|
generation=0,
|
||||||
|
)
|
||||||
|
population.append(individual)
|
||||||
|
|
||||||
|
return population
|
||||||
|
|
||||||
|
def evaluate_population(self, population: List[Individual]) -> List[Individual]:
|
||||||
|
"""Evaluate fitness for all individuals in population"""
|
||||||
|
print(f"\nEvaluating {len(population)} individuals...")
|
||||||
|
|
||||||
|
for idx, individual in enumerate(tqdm(population, desc="Evaluating")):
|
||||||
|
# Skip if already evaluated
|
||||||
|
if individual.fitness is not None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Evaluate
|
||||||
|
metrics = self.evaluator.evaluate(individual)
|
||||||
|
|
||||||
|
# Store metrics
|
||||||
|
individual.loss = metrics['loss']
|
||||||
|
individual.perplexity = metrics.get('perplexity')
|
||||||
|
individual.latency_ms = metrics.get('latency_ms')
|
||||||
|
individual.memory_mb = metrics.get('memory_mb')
|
||||||
|
individual.quality_score = metrics.get('quality_score', 0.0)
|
||||||
|
|
||||||
|
# Calculate multi-objective fitness
|
||||||
|
individual.fitness = self._calculate_fitness(individual)
|
||||||
|
|
||||||
|
return population
|
||||||
|
|
||||||
|
def _calculate_fitness(self, individual: Individual) -> float:
|
||||||
|
"""
|
||||||
|
Calculate multi-objective fitness score
|
||||||
|
|
||||||
|
Lower is better (we're minimizing)
|
||||||
|
"""
|
||||||
|
fitness = 0.0
|
||||||
|
|
||||||
|
# Loss component (lower is better)
|
||||||
|
if individual.loss is not None:
|
||||||
|
fitness += individual.loss * self.config.loss_weight
|
||||||
|
|
||||||
|
# Latency component (lower is better, normalized)
|
||||||
|
if individual.latency_ms is not None:
|
||||||
|
normalized_latency = individual.latency_ms / 1000.0 # Normalize to seconds
|
||||||
|
fitness += normalized_latency * self.config.latency_weight
|
||||||
|
|
||||||
|
# Memory component (lower is better, normalized)
|
||||||
|
if individual.memory_mb is not None:
|
||||||
|
normalized_memory = individual.memory_mb / 1000.0 # Normalize to GB
|
||||||
|
fitness += normalized_memory * self.config.memory_weight
|
||||||
|
|
||||||
|
# Quality component (higher is better, so negate)
|
||||||
|
if individual.quality_score is not None:
|
||||||
|
fitness -= individual.quality_score * self.config.quality_weight
|
||||||
|
|
||||||
|
return fitness
|
||||||
|
|
||||||
|
def select_parents(self, population: List[Individual]) -> List[Individual]:
|
||||||
|
"""
|
||||||
|
Select parents for next generation using elitism
|
||||||
|
|
||||||
|
Args:
|
||||||
|
population: Current population (should be evaluated)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Elite individuals to keep
|
||||||
|
"""
|
||||||
|
# Sort by fitness (lower is better)
|
||||||
|
sorted_pop = sorted(population, key=lambda x: x.fitness if x.fitness is not None else float('inf'))
|
||||||
|
|
||||||
|
# Select top performers
|
||||||
|
num_elite = max(1, int(len(population) * self.config.elite_ratio))
|
||||||
|
elite = sorted_pop[:num_elite]
|
||||||
|
|
||||||
|
return elite
|
||||||
|
|
||||||
|
def crossover(self, parent1: Individual, parent2: Individual) -> Individual:
|
||||||
|
"""
|
||||||
|
Create offspring by combining two parents
|
||||||
|
|
||||||
|
Uses uniform crossover - randomly picks from each parent
|
||||||
|
"""
|
||||||
|
child = Individual(
|
||||||
|
learning_rate=random.choice([parent1.learning_rate, parent2.learning_rate]),
|
||||||
|
batch_size=random.choice([parent1.batch_size, parent2.batch_size]),
|
||||||
|
warmup_steps=random.choice([parent1.warmup_steps, parent2.warmup_steps]),
|
||||||
|
weight_decay=random.choice([parent1.weight_decay, parent2.weight_decay]),
|
||||||
|
rope_theta=random.choice([parent1.rope_theta, parent2.rope_theta]),
|
||||||
|
hidden_act=random.choice([parent1.hidden_act, parent2.hidden_act]),
|
||||||
|
norm_type=random.choice([parent1.norm_type, parent2.norm_type]),
|
||||||
|
generation=self.generation + 1,
|
||||||
|
parent_ids=[id(parent1), id(parent2)],
|
||||||
|
)
|
||||||
|
|
||||||
|
return child
|
||||||
|
|
||||||
|
def mutate(self, individual: Individual) -> Individual:
|
||||||
|
"""
|
||||||
|
Mutate an individual with random changes
|
||||||
|
|
||||||
|
Args:
|
||||||
|
individual: Individual to mutate
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Mutated copy
|
||||||
|
"""
|
||||||
|
mutated = copy.deepcopy(individual)
|
||||||
|
mutated.generation = self.generation + 1
|
||||||
|
|
||||||
|
# Mutate each gene with some probability
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.learning_rate = random.uniform(self.config.lr_min, self.config.lr_max)
|
||||||
|
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.batch_size = random.choice(self.config.batch_size_options)
|
||||||
|
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.warmup_steps = random.randint(self.config.warmup_min, self.config.warmup_max)
|
||||||
|
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.weight_decay = random.uniform(self.config.wd_min, self.config.wd_max)
|
||||||
|
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.rope_theta = random.choice(self.config.rope_theta_options)
|
||||||
|
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.hidden_act = random.choice(self.config.activation_options)
|
||||||
|
|
||||||
|
if random.random() < self.config.mutation_rate:
|
||||||
|
mutated.norm_type = random.choice(self.config.norm_options)
|
||||||
|
|
||||||
|
# Reset fitness (needs re-evaluation)
|
||||||
|
mutated.fitness = None
|
||||||
|
mutated.loss = None
|
||||||
|
|
||||||
|
return mutated
|
||||||
|
|
||||||
|
def create_next_generation(self, parents: List[Individual]) -> List[Individual]:
|
||||||
|
"""Create next generation from parents"""
|
||||||
|
next_gen = []
|
||||||
|
|
||||||
|
# Keep elite unchanged
|
||||||
|
next_gen.extend(copy.deepcopy(parents))
|
||||||
|
|
||||||
|
# Fill rest with offspring
|
||||||
|
while len(next_gen) < self.config.population_size:
|
||||||
|
# Select two random parents
|
||||||
|
parent1, parent2 = random.sample(parents, 2)
|
||||||
|
|
||||||
|
# Crossover
|
||||||
|
child = self.crossover(parent1, parent2)
|
||||||
|
|
||||||
|
# Mutate
|
||||||
|
child = self.mutate(child)
|
||||||
|
|
||||||
|
next_gen.append(child)
|
||||||
|
|
||||||
|
return next_gen
|
||||||
|
|
||||||
|
def update_hall_of_fame(self, population: List[Individual]):
|
||||||
|
"""Update hall of fame with best individuals"""
|
||||||
|
# Add current best to hall of fame
|
||||||
|
for ind in population:
|
||||||
|
if ind.fitness is not None:
|
||||||
|
self.hall_of_fame.append(copy.deepcopy(ind))
|
||||||
|
|
||||||
|
# Sort by fitness
|
||||||
|
self.hall_of_fame.sort(key=lambda x: x.fitness if x.fitness is not None else float('inf'))
|
||||||
|
|
||||||
|
# Keep only top N
|
||||||
|
self.hall_of_fame = self.hall_of_fame[:self.max_hof_size]
|
||||||
|
|
||||||
|
def save_checkpoint(self):
|
||||||
|
"""Save evolution state"""
|
||||||
|
checkpoint_path = Path(self.config.save_dir) / f"generation_{self.generation}.json"
|
||||||
|
|
||||||
|
checkpoint = {
|
||||||
|
'generation': self.generation,
|
||||||
|
'population': [ind.to_dict() for ind in self.population],
|
||||||
|
'hall_of_fame': [ind.to_dict() for ind in self.hall_of_fame],
|
||||||
|
'config': self.config.__dict__,
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(checkpoint_path, 'w') as f:
|
||||||
|
json.dump(checkpoint, f, indent=2)
|
||||||
|
|
||||||
|
print(f" Checkpoint saved: {checkpoint_path}")
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
"""Run the evolution process"""
|
||||||
|
print("=" * 60)
|
||||||
|
print("NOVA-EVO: Genetic Algorithm Evolution")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
self.start_time = time.time()
|
||||||
|
|
||||||
|
# Initialize population
|
||||||
|
self.population = self.initialize_population()
|
||||||
|
|
||||||
|
# Evolution loop
|
||||||
|
for gen in range(self.config.num_generations):
|
||||||
|
self.generation = gen
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"Generation {gen + 1}/{self.config.num_generations}")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
# Evaluate
|
||||||
|
self.population = self.evaluate_population(self.population)
|
||||||
|
|
||||||
|
# Select parents
|
||||||
|
parents = self.select_parents(self.population)
|
||||||
|
|
||||||
|
# Update hall of fame
|
||||||
|
self.update_hall_of_fame(self.population)
|
||||||
|
|
||||||
|
# Report best individual
|
||||||
|
best = self.hall_of_fame[0] if self.hall_of_fame else None
|
||||||
|
if best:
|
||||||
|
print(f"\n🏆 Best individual so far:")
|
||||||
|
print(f" Fitness: {best.fitness:.4f}")
|
||||||
|
print(f" Loss: {best.loss:.4f}")
|
||||||
|
print(f" LR: {best.learning_rate:.2e}, BS: {best.batch_size}")
|
||||||
|
print(f" Activation: {best.hidden_act}, Norm: {best.norm_type}")
|
||||||
|
|
||||||
|
# Checkpoint
|
||||||
|
if (gen + 1) % self.config.checkpoint_every_n_generations == 0:
|
||||||
|
self.save_checkpoint()
|
||||||
|
|
||||||
|
# Create next generation
|
||||||
|
if gen < self.config.num_generations - 1:
|
||||||
|
self.population = self.create_next_generation(parents)
|
||||||
|
|
||||||
|
# Final checkpoint
|
||||||
|
self.save_checkpoint()
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Evolution Complete!")
|
||||||
|
print("=" * 60)
|
||||||
|
print(f"Total time: {(time.time() - self.start_time) / 3600:.2f} hours")
|
||||||
|
print(f"\nTop 3 individuals:")
|
||||||
|
for i, ind in enumerate(self.hall_of_fame[:3]):
|
||||||
|
print(f"\n{i+1}. Fitness: {ind.fitness:.4f}")
|
||||||
|
print(f" Loss: {ind.loss:.4f}, LR: {ind.learning_rate:.2e}")
|
||||||
|
print(f" Batch size: {ind.batch_size}, Warmup: {ind.warmup_steps}")
|
||||||
|
print(f" Activation: {ind.hidden_act}, Norm: {ind.norm_type}")
|
243
nova_evo/fitness.py
Normal file
243
nova_evo/fitness.py
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
"""
|
||||||
|
Fitness evaluator for NOVA-EVO
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import time
|
||||||
|
from typing import Dict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .config import Individual, EvolutionConfig
|
||||||
|
from nova_core import NovaTransformer, ModelConfig
|
||||||
|
from nova_train import NovaTrainer, TrainingConfig
|
||||||
|
|
||||||
|
|
||||||
|
class FitnessEvaluator:
|
||||||
|
"""
|
||||||
|
Evaluates fitness of individuals by training and measuring metrics
|
||||||
|
|
||||||
|
Metrics:
|
||||||
|
- Loss/perplexity (quality of learning)
|
||||||
|
- Latency (inference speed)
|
||||||
|
- Memory usage (peak RAM/VRAM)
|
||||||
|
- Chat quality (optional, if eval set available)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_model_config: ModelConfig,
|
||||||
|
evo_config: EvolutionConfig,
|
||||||
|
train_dataset,
|
||||||
|
eval_dataset=None,
|
||||||
|
device: str = "auto",
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
base_model_config: Base model configuration
|
||||||
|
evo_config: Evolution configuration
|
||||||
|
train_dataset: Training dataset for fitness eval
|
||||||
|
eval_dataset: Optional evaluation dataset
|
||||||
|
device: Device for training
|
||||||
|
"""
|
||||||
|
self.base_model_config = base_model_config
|
||||||
|
self.evo_config = evo_config
|
||||||
|
self.train_dataset = train_dataset
|
||||||
|
self.eval_dataset = eval_dataset
|
||||||
|
self.device = device
|
||||||
|
|
||||||
|
def evaluate(self, individual: Individual) -> Dict[str, float]:
|
||||||
|
"""
|
||||||
|
Evaluate fitness of an individual
|
||||||
|
|
||||||
|
Args:
|
||||||
|
individual: Individual to evaluate
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary of metrics
|
||||||
|
"""
|
||||||
|
# Create model with individual's architecture choices
|
||||||
|
model_config = self._create_model_config(individual)
|
||||||
|
model = NovaTransformer(model_config)
|
||||||
|
|
||||||
|
# Create training config with individual's hyperparameters
|
||||||
|
train_config = self._create_training_config(individual)
|
||||||
|
|
||||||
|
# Train for eval_steps
|
||||||
|
train_loader = self._create_dataloader(
|
||||||
|
self.train_dataset,
|
||||||
|
batch_size=individual.batch_size
|
||||||
|
)
|
||||||
|
|
||||||
|
# Quick training
|
||||||
|
loss = self._quick_train(model, train_config, train_loader)
|
||||||
|
|
||||||
|
# Measure latency
|
||||||
|
latency_ms = self._measure_latency(model)
|
||||||
|
|
||||||
|
# Measure memory
|
||||||
|
memory_mb = self._measure_memory(model)
|
||||||
|
|
||||||
|
# Calculate perplexity
|
||||||
|
perplexity = torch.exp(torch.tensor(loss)).item() if loss < 100 else float('inf')
|
||||||
|
|
||||||
|
return {
|
||||||
|
'loss': loss,
|
||||||
|
'perplexity': perplexity,
|
||||||
|
'latency_ms': latency_ms,
|
||||||
|
'memory_mb': memory_mb,
|
||||||
|
'quality_score': 0.0, # TODO: Implement chat quality eval
|
||||||
|
}
|
||||||
|
|
||||||
|
def _create_model_config(self, individual: Individual) -> ModelConfig:
|
||||||
|
"""Create model config from individual's genes"""
|
||||||
|
config = ModelConfig(
|
||||||
|
vocab_size=self.base_model_config.vocab_size,
|
||||||
|
hidden_size=self.base_model_config.hidden_size,
|
||||||
|
num_hidden_layers=self.base_model_config.num_hidden_layers,
|
||||||
|
num_attention_heads=self.base_model_config.num_attention_heads,
|
||||||
|
intermediate_size=self.base_model_config.intermediate_size,
|
||||||
|
max_position_embeddings=self.base_model_config.max_position_embeddings,
|
||||||
|
# Individual's choices
|
||||||
|
rope_theta=individual.rope_theta,
|
||||||
|
hidden_act=individual.hidden_act,
|
||||||
|
norm_type=individual.norm_type,
|
||||||
|
)
|
||||||
|
return config
|
||||||
|
|
||||||
|
def _create_training_config(self, individual: Individual) -> TrainingConfig:
|
||||||
|
"""Create training config from individual's hyperparameters"""
|
||||||
|
config = TrainingConfig(
|
||||||
|
learning_rate=individual.learning_rate,
|
||||||
|
batch_size=individual.batch_size,
|
||||||
|
warmup_steps=individual.warmup_steps,
|
||||||
|
weight_decay=individual.weight_decay,
|
||||||
|
num_epochs=1, # Just one pass for eval
|
||||||
|
save_steps=999999, # Don't save during eval
|
||||||
|
device=self.device,
|
||||||
|
)
|
||||||
|
return config
|
||||||
|
|
||||||
|
def _create_dataloader(self, dataset, batch_size: int):
|
||||||
|
"""Create dataloader for training"""
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
return DataLoader(
|
||||||
|
dataset,
|
||||||
|
batch_size=batch_size,
|
||||||
|
shuffle=True,
|
||||||
|
num_workers=0,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _quick_train(
|
||||||
|
self,
|
||||||
|
model: NovaTransformer,
|
||||||
|
train_config: TrainingConfig,
|
||||||
|
train_loader
|
||||||
|
) -> float:
|
||||||
|
"""
|
||||||
|
Quick training for evaluation
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Final loss
|
||||||
|
"""
|
||||||
|
# Limit to eval_steps
|
||||||
|
limited_loader = []
|
||||||
|
for i, batch in enumerate(train_loader):
|
||||||
|
if i >= self.evo_config.eval_steps:
|
||||||
|
break
|
||||||
|
limited_loader.append(batch)
|
||||||
|
|
||||||
|
if not limited_loader:
|
||||||
|
return float('inf')
|
||||||
|
|
||||||
|
# Simple training loop
|
||||||
|
device = torch.device(self.device if self.device != "auto" else "cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
model.to(device)
|
||||||
|
model.train()
|
||||||
|
|
||||||
|
optimizer = torch.optim.AdamW(
|
||||||
|
model.parameters(),
|
||||||
|
lr=train_config.learning_rate,
|
||||||
|
weight_decay=train_config.weight_decay,
|
||||||
|
)
|
||||||
|
|
||||||
|
total_loss = 0.0
|
||||||
|
num_batches = 0
|
||||||
|
|
||||||
|
for batch in limited_loader:
|
||||||
|
input_ids = batch['input_ids'].to(device)
|
||||||
|
labels = batch.get('labels', input_ids).to(device)
|
||||||
|
|
||||||
|
outputs = model(input_ids=input_ids)
|
||||||
|
logits = outputs['logits']
|
||||||
|
|
||||||
|
# Calculate loss
|
||||||
|
shift_logits = logits[..., :-1, :].contiguous()
|
||||||
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
|
|
||||||
|
loss = torch.nn.functional.cross_entropy(
|
||||||
|
shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1),
|
||||||
|
ignore_index=-100
|
||||||
|
)
|
||||||
|
|
||||||
|
optimizer.zero_grad()
|
||||||
|
loss.backward()
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
total_loss += loss.item()
|
||||||
|
num_batches += 1
|
||||||
|
|
||||||
|
return total_loss / num_batches if num_batches > 0 else float('inf')
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def _measure_latency(self, model: NovaTransformer) -> float:
|
||||||
|
"""
|
||||||
|
Measure average inference latency in milliseconds
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model: Model to measure
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Average latency in ms
|
||||||
|
"""
|
||||||
|
device = next(model.parameters()).device
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
# Dummy input
|
||||||
|
input_ids = torch.randint(0, model.config.vocab_size, (1, 128), device=device)
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
for _ in range(3):
|
||||||
|
_ = model(input_ids=input_ids)
|
||||||
|
|
||||||
|
# Measure
|
||||||
|
num_runs = 10
|
||||||
|
start = time.time()
|
||||||
|
|
||||||
|
for _ in range(num_runs):
|
||||||
|
_ = model(input_ids=input_ids)
|
||||||
|
|
||||||
|
if device.type == 'cuda':
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
elapsed = (time.time() - start) / num_runs
|
||||||
|
return elapsed * 1000 # Convert to ms
|
||||||
|
|
||||||
|
def _measure_memory(self, model: NovaTransformer) -> float:
|
||||||
|
"""
|
||||||
|
Measure peak memory usage in MB
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model: Model to measure
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Peak memory in MB
|
||||||
|
"""
|
||||||
|
# Count parameters
|
||||||
|
num_params = sum(p.numel() for p in model.parameters())
|
||||||
|
|
||||||
|
# Approximate memory (4 bytes per float32 parameter)
|
||||||
|
memory_mb = (num_params * 4) / (1024 ** 2)
|
||||||
|
|
||||||
|
return memory_mb
|
11
nova_tokenizer/__init__.py
Normal file
11
nova_tokenizer/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
"""
|
||||||
|
NOVA Tokenizer - SentencePiece-based tokenization
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .tokenizer import NovaTokenizer
|
||||||
|
from .trainer import train_tokenizer
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'NovaTokenizer',
|
||||||
|
'train_tokenizer',
|
||||||
|
]
|
157
nova_tokenizer/tokenizer.py
Normal file
157
nova_tokenizer/tokenizer.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
"""
|
||||||
|
NOVA Tokenizer - SentencePiece-based tokenization
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sentencepiece as spm
|
||||||
|
from typing import List, Union, Optional
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class NovaTokenizer:
|
||||||
|
"""
|
||||||
|
SentencePiece tokenizer for NOVA
|
||||||
|
|
||||||
|
Supports both BPE and Unigram models with special tokens
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_path: str,
|
||||||
|
add_bos: bool = True,
|
||||||
|
add_eos: bool = True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
model_path: Path to SentencePiece model file (.model)
|
||||||
|
add_bos: Whether to add BOS token by default
|
||||||
|
add_eos: Whether to add EOS token by default
|
||||||
|
"""
|
||||||
|
if not os.path.exists(model_path):
|
||||||
|
raise FileNotFoundError(f"Tokenizer model not found: {model_path}")
|
||||||
|
|
||||||
|
self.sp = spm.SentencePieceProcessor()
|
||||||
|
self.sp.Load(model_path)
|
||||||
|
|
||||||
|
self.add_bos = add_bos
|
||||||
|
self.add_eos = add_eos
|
||||||
|
|
||||||
|
# Special token IDs
|
||||||
|
self.bos_id = self.sp.bos_id()
|
||||||
|
self.eos_id = self.sp.eos_id()
|
||||||
|
self.pad_id = self.sp.pad_id()
|
||||||
|
self.unk_id = self.sp.unk_id()
|
||||||
|
|
||||||
|
# Vocabulary info
|
||||||
|
self.vocab_size = self.sp.vocab_size()
|
||||||
|
|
||||||
|
def encode(
|
||||||
|
self,
|
||||||
|
text: Union[str, List[str]],
|
||||||
|
add_bos: Optional[bool] = None,
|
||||||
|
add_eos: Optional[bool] = None,
|
||||||
|
) -> Union[List[int], List[List[int]]]:
|
||||||
|
"""
|
||||||
|
Encode text to token IDs
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Single string or list of strings
|
||||||
|
add_bos: Override default BOS behavior
|
||||||
|
add_eos: Override default EOS behavior
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Token IDs (single list or list of lists)
|
||||||
|
"""
|
||||||
|
add_bos = self.add_bos if add_bos is None else add_bos
|
||||||
|
add_eos = self.add_eos if add_eos is None else add_eos
|
||||||
|
|
||||||
|
if isinstance(text, str):
|
||||||
|
ids = self.sp.Encode(text)
|
||||||
|
if add_bos:
|
||||||
|
ids = [self.bos_id] + ids
|
||||||
|
if add_eos:
|
||||||
|
ids = ids + [self.eos_id]
|
||||||
|
return ids
|
||||||
|
else:
|
||||||
|
return [self.encode(t, add_bos, add_eos) for t in text]
|
||||||
|
|
||||||
|
def decode(
|
||||||
|
self,
|
||||||
|
ids: Union[List[int], List[List[int]]],
|
||||||
|
skip_special_tokens: bool = True,
|
||||||
|
) -> Union[str, List[str]]:
|
||||||
|
"""
|
||||||
|
Decode token IDs to text
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ids: Single list of IDs or list of lists
|
||||||
|
skip_special_tokens: Whether to remove special tokens
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Decoded text (single string or list of strings)
|
||||||
|
"""
|
||||||
|
if isinstance(ids[0], list):
|
||||||
|
return [self.decode(i, skip_special_tokens) for i in ids]
|
||||||
|
|
||||||
|
if skip_special_tokens:
|
||||||
|
# Remove BOS, EOS, PAD tokens
|
||||||
|
ids = [i for i in ids if i not in [self.bos_id, self.eos_id, self.pad_id]]
|
||||||
|
|
||||||
|
return self.sp.Decode(ids)
|
||||||
|
|
||||||
|
def encode_batch(
|
||||||
|
self,
|
||||||
|
texts: List[str],
|
||||||
|
add_bos: Optional[bool] = None,
|
||||||
|
add_eos: Optional[bool] = None,
|
||||||
|
) -> List[List[int]]:
|
||||||
|
"""Encode batch of texts"""
|
||||||
|
return self.encode(texts, add_bos, add_eos)
|
||||||
|
|
||||||
|
def decode_batch(
|
||||||
|
self,
|
||||||
|
ids_list: List[List[int]],
|
||||||
|
skip_special_tokens: bool = True,
|
||||||
|
) -> List[str]:
|
||||||
|
"""Decode batch of token ID lists"""
|
||||||
|
return self.decode(ids_list, skip_special_tokens)
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
"""Return vocabulary size"""
|
||||||
|
return self.vocab_size
|
||||||
|
|
||||||
|
def __call__(
|
||||||
|
self,
|
||||||
|
text: Union[str, List[str]],
|
||||||
|
add_bos: Optional[bool] = None,
|
||||||
|
add_eos: Optional[bool] = None,
|
||||||
|
) -> Union[List[int], List[List[int]]]:
|
||||||
|
"""Shorthand for encode"""
|
||||||
|
return self.encode(text, add_bos, add_eos)
|
||||||
|
|
||||||
|
def get_piece(self, token_id: int) -> str:
|
||||||
|
"""Get string piece for token ID"""
|
||||||
|
return self.sp.IdToPiece(token_id)
|
||||||
|
|
||||||
|
def get_id(self, piece: str) -> int:
|
||||||
|
"""Get token ID for string piece"""
|
||||||
|
return self.sp.PieceToId(piece)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def bos_token(self) -> str:
|
||||||
|
"""BOS token string"""
|
||||||
|
return self.sp.IdToPiece(self.bos_id) if self.bos_id >= 0 else ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def eos_token(self) -> str:
|
||||||
|
"""EOS token string"""
|
||||||
|
return self.sp.IdToPiece(self.eos_id) if self.eos_id >= 0 else ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def pad_token(self) -> str:
|
||||||
|
"""PAD token string"""
|
||||||
|
return self.sp.IdToPiece(self.pad_id) if self.pad_id >= 0 else ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def unk_token(self) -> str:
|
||||||
|
"""UNK token string"""
|
||||||
|
return self.sp.IdToPiece(self.unk_id) if self.unk_id >= 0 else ""
|
152
nova_tokenizer/trainer.py
Normal file
152
nova_tokenizer/trainer.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
"""
|
||||||
|
SentencePiece tokenizer trainer
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sentencepiece as spm
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
|
||||||
|
def train_tokenizer(
|
||||||
|
input_files: List[str],
|
||||||
|
model_prefix: str,
|
||||||
|
vocab_size: int = 32000,
|
||||||
|
model_type: str = "bpe", # or "unigram"
|
||||||
|
character_coverage: float = 0.9995,
|
||||||
|
num_threads: int = 4,
|
||||||
|
user_defined_symbols: Optional[List[str]] = None,
|
||||||
|
max_sentence_length: int = 16384,
|
||||||
|
shuffle_input_sentence: bool = True,
|
||||||
|
seed_sentencepiece_size: int = 1000000,
|
||||||
|
**kwargs
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Train a SentencePiece tokenizer
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_files: List of text file paths for training
|
||||||
|
model_prefix: Output model path prefix (will create .model and .vocab files)
|
||||||
|
vocab_size: Target vocabulary size
|
||||||
|
model_type: 'bpe' or 'unigram'
|
||||||
|
character_coverage: Character coverage (0.9995 for multilingual, 1.0 for single language)
|
||||||
|
num_threads: Number of threads for training
|
||||||
|
user_defined_symbols: Optional list of user-defined symbols to add
|
||||||
|
max_sentence_length: Maximum sentence length
|
||||||
|
shuffle_input_sentence: Whether to shuffle input sentences
|
||||||
|
seed_sentencepiece_size: Number of sentences to use for initial seed
|
||||||
|
**kwargs: Additional arguments to pass to SentencePiece trainer
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to trained model file
|
||||||
|
"""
|
||||||
|
# Validate input files
|
||||||
|
for f in input_files:
|
||||||
|
if not Path(f).exists():
|
||||||
|
raise FileNotFoundError(f"Input file not found: {f}")
|
||||||
|
|
||||||
|
# Prepare training arguments
|
||||||
|
train_args = {
|
||||||
|
'input': ','.join(input_files),
|
||||||
|
'model_prefix': model_prefix,
|
||||||
|
'vocab_size': vocab_size,
|
||||||
|
'model_type': model_type,
|
||||||
|
'character_coverage': character_coverage,
|
||||||
|
'num_threads': num_threads,
|
||||||
|
'max_sentence_length': max_sentence_length,
|
||||||
|
'shuffle_input_sentence': shuffle_input_sentence,
|
||||||
|
'seed_sentencepiece_size': seed_sentencepiece_size,
|
||||||
|
|
||||||
|
# Special tokens
|
||||||
|
'pad_id': 0,
|
||||||
|
'unk_id': 1,
|
||||||
|
'bos_id': 2,
|
||||||
|
'eos_id': 3,
|
||||||
|
'pad_piece': '<pad>',
|
||||||
|
'unk_piece': '<unk>',
|
||||||
|
'bos_piece': '<s>',
|
||||||
|
'eos_piece': '</s>',
|
||||||
|
|
||||||
|
# User-defined symbols (e.g., for special control tokens)
|
||||||
|
'user_defined_symbols': user_defined_symbols or [],
|
||||||
|
|
||||||
|
# Normalization
|
||||||
|
'normalization_rule_name': 'nmt_nfkc_cf', # Standard normalization
|
||||||
|
'remove_extra_whitespaces': True,
|
||||||
|
'split_by_unicode_script': True,
|
||||||
|
'split_by_whitespace': True,
|
||||||
|
'split_by_number': True,
|
||||||
|
'split_digits': True,
|
||||||
|
'byte_fallback': True, # Handle unknown bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add any additional kwargs
|
||||||
|
train_args.update(kwargs)
|
||||||
|
|
||||||
|
# Train the model
|
||||||
|
print(f"Training {model_type.upper()} tokenizer with vocab size {vocab_size}...")
|
||||||
|
print(f"Input files: {len(input_files)} file(s)")
|
||||||
|
print(f"Output: {model_prefix}.model")
|
||||||
|
|
||||||
|
spm.SentencePieceTrainer.Train(**{k: str(v) if isinstance(v, list) else v
|
||||||
|
for k, v in train_args.items()})
|
||||||
|
|
||||||
|
model_path = f"{model_prefix}.model"
|
||||||
|
|
||||||
|
# Verify the model was created
|
||||||
|
if not Path(model_path).exists():
|
||||||
|
raise RuntimeError(f"Model training failed - {model_path} not created")
|
||||||
|
|
||||||
|
# Print vocab info
|
||||||
|
sp = spm.SentencePieceProcessor()
|
||||||
|
sp.Load(model_path)
|
||||||
|
print(f"✓ Tokenizer trained successfully!")
|
||||||
|
print(f" Vocabulary size: {sp.vocab_size()}")
|
||||||
|
print(f" BOS token: {sp.IdToPiece(sp.bos_id())} (ID: {sp.bos_id()})")
|
||||||
|
print(f" EOS token: {sp.IdToPiece(sp.eos_id())} (ID: {sp.eos_id()})")
|
||||||
|
print(f" PAD token: {sp.IdToPiece(sp.pad_id())} (ID: {sp.pad_id()})")
|
||||||
|
print(f" UNK token: {sp.IdToPiece(sp.unk_id())} (ID: {sp.unk_id()})")
|
||||||
|
|
||||||
|
return model_path
|
||||||
|
|
||||||
|
|
||||||
|
def train_from_text(
|
||||||
|
texts: List[str],
|
||||||
|
model_prefix: str,
|
||||||
|
vocab_size: int = 32000,
|
||||||
|
model_type: str = "bpe",
|
||||||
|
**kwargs
|
||||||
|
) -> str:
|
||||||
|
"""
|
||||||
|
Train tokenizer directly from list of texts (without needing files)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
texts: List of text strings
|
||||||
|
model_prefix: Output model path prefix
|
||||||
|
vocab_size: Target vocabulary size
|
||||||
|
model_type: 'bpe' or 'unigram'
|
||||||
|
**kwargs: Additional arguments
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to trained model file
|
||||||
|
"""
|
||||||
|
# Write texts to temporary file
|
||||||
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as f:
|
||||||
|
for text in texts:
|
||||||
|
f.write(text.strip() + '\n')
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Train using the temporary file
|
||||||
|
model_path = train_tokenizer(
|
||||||
|
input_files=[temp_file],
|
||||||
|
model_prefix=model_prefix,
|
||||||
|
vocab_size=vocab_size,
|
||||||
|
model_type=model_type,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
# Clean up temp file
|
||||||
|
Path(temp_file).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
return model_path
|
11
nova_train/__init__.py
Normal file
11
nova_train/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
"""
|
||||||
|
NOVA Train - Training pipeline with AMP, gradient checkpointing, DDP
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .trainer import NovaTrainer
|
||||||
|
from .config import TrainingConfig
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'NovaTrainer',
|
||||||
|
'TrainingConfig',
|
||||||
|
]
|
74
nova_train/config.py
Normal file
74
nova_train/config.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
"""
|
||||||
|
Training configuration
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TrainingConfig:
|
||||||
|
"""Configuration for training NOVA models"""
|
||||||
|
|
||||||
|
# Model
|
||||||
|
model_name: str = "nova-125m"
|
||||||
|
model_config_path: Optional[str] = None
|
||||||
|
|
||||||
|
# Data
|
||||||
|
train_data_path: str = "data/train"
|
||||||
|
val_data_path: str = "data/val"
|
||||||
|
max_seq_length: int = 2048
|
||||||
|
|
||||||
|
# Training hyperparameters
|
||||||
|
num_epochs: int = 10
|
||||||
|
batch_size: int = 8
|
||||||
|
gradient_accumulation_steps: int = 4
|
||||||
|
learning_rate: float = 3e-4
|
||||||
|
weight_decay: float = 0.1
|
||||||
|
max_grad_norm: float = 1.0
|
||||||
|
warmup_steps: int = 1000
|
||||||
|
lr_scheduler: str = "cosine" # or "linear", "constant"
|
||||||
|
|
||||||
|
# Optimization
|
||||||
|
optimizer: str = "adamw" # or "lion", "adafactor"
|
||||||
|
adam_beta1: float = 0.9
|
||||||
|
adam_beta2: float = 0.95
|
||||||
|
adam_epsilon: float = 1e-8
|
||||||
|
|
||||||
|
# Mixed precision and efficiency
|
||||||
|
use_amp: bool = True # Automatic Mixed Precision
|
||||||
|
gradient_checkpointing: bool = False
|
||||||
|
use_ddp: bool = False # Distributed Data Parallel
|
||||||
|
|
||||||
|
# Checkpointing
|
||||||
|
save_dir: str = "checkpoints"
|
||||||
|
save_steps: int = 1000
|
||||||
|
save_total_limit: int = 5
|
||||||
|
resume_from_checkpoint: Optional[str] = None
|
||||||
|
|
||||||
|
# Evaluation
|
||||||
|
eval_steps: int = 500
|
||||||
|
eval_strategy: str = "steps" # or "epoch"
|
||||||
|
logging_steps: int = 100
|
||||||
|
|
||||||
|
# Early stopping
|
||||||
|
early_stopping: bool = False
|
||||||
|
early_stopping_patience: int = 3
|
||||||
|
early_stopping_threshold: float = 0.001
|
||||||
|
|
||||||
|
# Reproducibility
|
||||||
|
seed: int = 42
|
||||||
|
|
||||||
|
# Device
|
||||||
|
device: str = "auto" # "auto", "cpu", "cuda", "cuda:0", etc.
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
log_to_wandb: bool = False
|
||||||
|
wandb_project: Optional[str] = None
|
||||||
|
wandb_run_name: Optional[str] = None
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
"""Validate configuration"""
|
||||||
|
assert self.batch_size > 0, "batch_size must be positive"
|
||||||
|
assert self.learning_rate > 0, "learning_rate must be positive"
|
||||||
|
assert self.gradient_accumulation_steps > 0, "gradient_accumulation_steps must be positive"
|
330
nova_train/trainer.py
Normal file
330
nova_train/trainer.py
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
"""
|
||||||
|
NOVA Trainer - Training loop with AMP, gradient checkpointing, DDP
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.optim as optim
|
||||||
|
from torch.cuda.amp import autocast, GradScaler
|
||||||
|
from torch.utils.data import DataLoader, DistributedSampler
|
||||||
|
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||||
|
import torch.distributed as dist
|
||||||
|
from pathlib import Path
|
||||||
|
from tqdm import tqdm
|
||||||
|
from typing import Optional, Dict, Any
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
|
||||||
|
from .config import TrainingConfig
|
||||||
|
from nova_core import NovaTransformer, ModelConfig
|
||||||
|
|
||||||
|
|
||||||
|
class NovaTrainer:
|
||||||
|
"""
|
||||||
|
Trainer for NOVA models with support for:
|
||||||
|
- Automatic Mixed Precision (AMP)
|
||||||
|
- Gradient checkpointing
|
||||||
|
- Distributed Data Parallel (DDP)
|
||||||
|
- Resume from checkpoint
|
||||||
|
- Early stopping
|
||||||
|
- Cosine learning rate schedule with warmup
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: NovaTransformer,
|
||||||
|
train_config: TrainingConfig,
|
||||||
|
train_dataloader: DataLoader,
|
||||||
|
val_dataloader: Optional[DataLoader] = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
model: NOVA transformer model
|
||||||
|
train_config: Training configuration
|
||||||
|
train_dataloader: Training data loader
|
||||||
|
val_dataloader: Optional validation data loader
|
||||||
|
"""
|
||||||
|
self.config = train_config
|
||||||
|
self.model = model
|
||||||
|
self.train_dataloader = train_dataloader
|
||||||
|
self.val_dataloader = val_dataloader
|
||||||
|
|
||||||
|
# Setup device
|
||||||
|
self.device = self._setup_device()
|
||||||
|
self.model.to(self.device)
|
||||||
|
|
||||||
|
# Setup distributed training if needed
|
||||||
|
self.is_ddp = train_config.use_ddp and torch.cuda.device_count() > 1
|
||||||
|
if self.is_ddp:
|
||||||
|
self.model = DDP(self.model)
|
||||||
|
|
||||||
|
# Setup optimizer
|
||||||
|
self.optimizer = self._create_optimizer()
|
||||||
|
|
||||||
|
# Setup learning rate scheduler
|
||||||
|
total_steps = len(train_dataloader) * train_config.num_epochs // train_config.gradient_accumulation_steps
|
||||||
|
self.scheduler = self._create_scheduler(total_steps)
|
||||||
|
|
||||||
|
# Setup AMP
|
||||||
|
self.use_amp = train_config.use_amp and self.device.type == 'cuda'
|
||||||
|
self.scaler = GradScaler() if self.use_amp else None
|
||||||
|
|
||||||
|
# Tracking
|
||||||
|
self.global_step = 0
|
||||||
|
self.current_epoch = 0
|
||||||
|
self.best_val_loss = float('inf')
|
||||||
|
self.patience_counter = 0
|
||||||
|
|
||||||
|
# Create save directory
|
||||||
|
Path(train_config.save_dir).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
def _setup_device(self) -> torch.device:
|
||||||
|
"""Setup training device"""
|
||||||
|
if self.config.device == "auto":
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
return torch.device("cuda")
|
||||||
|
else:
|
||||||
|
return torch.device("cpu")
|
||||||
|
else:
|
||||||
|
return torch.device(self.config.device)
|
||||||
|
|
||||||
|
def _create_optimizer(self) -> optim.Optimizer:
|
||||||
|
"""Create optimizer"""
|
||||||
|
# Separate parameters with and without weight decay
|
||||||
|
decay_params = []
|
||||||
|
no_decay_params = []
|
||||||
|
|
||||||
|
for name, param in self.model.named_parameters():
|
||||||
|
if param.requires_grad:
|
||||||
|
# Don't apply weight decay to biases and layer norms
|
||||||
|
if 'bias' in name or 'norm' in name:
|
||||||
|
no_decay_params.append(param)
|
||||||
|
else:
|
||||||
|
decay_params.append(param)
|
||||||
|
|
||||||
|
param_groups = [
|
||||||
|
{'params': decay_params, 'weight_decay': self.config.weight_decay},
|
||||||
|
{'params': no_decay_params, 'weight_decay': 0.0}
|
||||||
|
]
|
||||||
|
|
||||||
|
if self.config.optimizer.lower() == "adamw":
|
||||||
|
return optim.AdamW(
|
||||||
|
param_groups,
|
||||||
|
lr=self.config.learning_rate,
|
||||||
|
betas=(self.config.adam_beta1, self.config.adam_beta2),
|
||||||
|
eps=self.config.adam_epsilon
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown optimizer: {self.config.optimizer}")
|
||||||
|
|
||||||
|
def _create_scheduler(self, total_steps: int):
|
||||||
|
"""Create learning rate scheduler with warmup"""
|
||||||
|
if self.config.lr_scheduler == "cosine":
|
||||||
|
def lr_lambda(current_step: int):
|
||||||
|
# Warmup
|
||||||
|
if current_step < self.config.warmup_steps:
|
||||||
|
return float(current_step) / float(max(1, self.config.warmup_steps))
|
||||||
|
# Cosine decay
|
||||||
|
progress = float(current_step - self.config.warmup_steps) / float(max(1, total_steps - self.config.warmup_steps))
|
||||||
|
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
|
||||||
|
|
||||||
|
return optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda)
|
||||||
|
|
||||||
|
elif self.config.lr_scheduler == "linear":
|
||||||
|
def lr_lambda(current_step: int):
|
||||||
|
if current_step < self.config.warmup_steps:
|
||||||
|
return float(current_step) / float(max(1, self.config.warmup_steps))
|
||||||
|
return max(0.0, float(total_steps - current_step) / float(max(1, total_steps - self.config.warmup_steps)))
|
||||||
|
|
||||||
|
return optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda)
|
||||||
|
|
||||||
|
else: # constant
|
||||||
|
return optim.lr_scheduler.LambdaLR(self.optimizer, lambda _: 1.0)
|
||||||
|
|
||||||
|
def train(self):
|
||||||
|
"""Main training loop"""
|
||||||
|
print(f"Starting training on {self.device}")
|
||||||
|
print(f" Num epochs: {self.config.num_epochs}")
|
||||||
|
print(f" Batch size: {self.config.batch_size}")
|
||||||
|
print(f" Gradient accumulation steps: {self.config.gradient_accumulation_steps}")
|
||||||
|
print(f" Learning rate: {self.config.learning_rate}")
|
||||||
|
print(f" Mixed precision: {self.use_amp}")
|
||||||
|
|
||||||
|
for epoch in range(self.current_epoch, self.config.num_epochs):
|
||||||
|
self.current_epoch = epoch
|
||||||
|
print(f"\nEpoch {epoch + 1}/{self.config.num_epochs}")
|
||||||
|
|
||||||
|
# Training
|
||||||
|
train_loss = self.train_epoch()
|
||||||
|
print(f" Train loss: {train_loss:.4f}")
|
||||||
|
|
||||||
|
# Validation
|
||||||
|
if self.val_dataloader is not None:
|
||||||
|
val_loss = self.evaluate()
|
||||||
|
print(f" Val loss: {val_loss:.4f}")
|
||||||
|
|
||||||
|
# Early stopping check
|
||||||
|
if self.config.early_stopping:
|
||||||
|
if val_loss < self.best_val_loss - self.config.early_stopping_threshold:
|
||||||
|
self.best_val_loss = val_loss
|
||||||
|
self.patience_counter = 0
|
||||||
|
self.save_checkpoint(is_best=True)
|
||||||
|
else:
|
||||||
|
self.patience_counter += 1
|
||||||
|
if self.patience_counter >= self.config.early_stopping_patience:
|
||||||
|
print(f"Early stopping triggered after {epoch + 1} epochs")
|
||||||
|
break
|
||||||
|
|
||||||
|
print("\nTraining complete!")
|
||||||
|
|
||||||
|
def train_epoch(self) -> float:
|
||||||
|
"""Train for one epoch"""
|
||||||
|
self.model.train()
|
||||||
|
total_loss = 0.0
|
||||||
|
num_batches = 0
|
||||||
|
|
||||||
|
progress_bar = tqdm(self.train_dataloader, desc="Training")
|
||||||
|
|
||||||
|
for batch_idx, batch in enumerate(progress_bar):
|
||||||
|
loss = self.train_step(batch)
|
||||||
|
total_loss += loss
|
||||||
|
num_batches += 1
|
||||||
|
|
||||||
|
progress_bar.set_postfix({"loss": f"{loss:.4f}", "lr": f"{self.scheduler.get_last_lr()[0]:.2e}"})
|
||||||
|
|
||||||
|
return total_loss / num_batches
|
||||||
|
|
||||||
|
def train_step(self, batch: Dict[str, torch.Tensor]) -> float:
|
||||||
|
"""Single training step"""
|
||||||
|
input_ids = batch['input_ids'].to(self.device)
|
||||||
|
labels = batch.get('labels', input_ids).to(self.device)
|
||||||
|
|
||||||
|
# Forward pass with AMP
|
||||||
|
with autocast(enabled=self.use_amp):
|
||||||
|
outputs = self.model(input_ids=input_ids)
|
||||||
|
logits = outputs['logits']
|
||||||
|
|
||||||
|
# Calculate loss (next token prediction)
|
||||||
|
shift_logits = logits[..., :-1, :].contiguous()
|
||||||
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
|
|
||||||
|
loss = nn.functional.cross_entropy(
|
||||||
|
shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1),
|
||||||
|
ignore_index=-100
|
||||||
|
)
|
||||||
|
|
||||||
|
# Scale loss for gradient accumulation
|
||||||
|
loss = loss / self.config.gradient_accumulation_steps
|
||||||
|
|
||||||
|
# Backward pass with gradient scaling
|
||||||
|
if self.use_amp:
|
||||||
|
self.scaler.scale(loss).backward()
|
||||||
|
else:
|
||||||
|
loss.backward()
|
||||||
|
|
||||||
|
# Update weights every N accumulation steps
|
||||||
|
if (self.global_step + 1) % self.config.gradient_accumulation_steps == 0:
|
||||||
|
# Gradient clipping
|
||||||
|
if self.use_amp:
|
||||||
|
self.scaler.unscale_(self.optimizer)
|
||||||
|
|
||||||
|
torch.nn.utils.clip_grad_norm_(
|
||||||
|
self.model.parameters(),
|
||||||
|
self.config.max_grad_norm
|
||||||
|
)
|
||||||
|
|
||||||
|
# Optimizer step
|
||||||
|
if self.use_amp:
|
||||||
|
self.scaler.step(self.optimizer)
|
||||||
|
self.scaler.update()
|
||||||
|
else:
|
||||||
|
self.optimizer.step()
|
||||||
|
|
||||||
|
self.scheduler.step()
|
||||||
|
self.optimizer.zero_grad()
|
||||||
|
|
||||||
|
self.global_step += 1
|
||||||
|
|
||||||
|
# Checkpointing
|
||||||
|
if self.global_step % self.config.save_steps == 0:
|
||||||
|
self.save_checkpoint()
|
||||||
|
|
||||||
|
return loss.item() * self.config.gradient_accumulation_steps
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def evaluate(self) -> float:
|
||||||
|
"""Evaluate on validation set"""
|
||||||
|
self.model.eval()
|
||||||
|
total_loss = 0.0
|
||||||
|
num_batches = 0
|
||||||
|
|
||||||
|
for batch in tqdm(self.val_dataloader, desc="Evaluating"):
|
||||||
|
input_ids = batch['input_ids'].to(self.device)
|
||||||
|
labels = batch.get('labels', input_ids).to(self.device)
|
||||||
|
|
||||||
|
with autocast(enabled=self.use_amp):
|
||||||
|
outputs = self.model(input_ids=input_ids)
|
||||||
|
logits = outputs['logits']
|
||||||
|
|
||||||
|
shift_logits = logits[..., :-1, :].contiguous()
|
||||||
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
|
|
||||||
|
loss = nn.functional.cross_entropy(
|
||||||
|
shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
|
shift_labels.view(-1),
|
||||||
|
ignore_index=-100
|
||||||
|
)
|
||||||
|
|
||||||
|
total_loss += loss.item()
|
||||||
|
num_batches += 1
|
||||||
|
|
||||||
|
return total_loss / num_batches
|
||||||
|
|
||||||
|
def save_checkpoint(self, is_best: bool = False):
|
||||||
|
"""Save model checkpoint"""
|
||||||
|
model_to_save = self.model.module if self.is_ddp else self.model
|
||||||
|
|
||||||
|
checkpoint = {
|
||||||
|
'model_state_dict': model_to_save.state_dict(),
|
||||||
|
'optimizer_state_dict': self.optimizer.state_dict(),
|
||||||
|
'scheduler_state_dict': self.scheduler.state_dict(),
|
||||||
|
'global_step': self.global_step,
|
||||||
|
'epoch': self.current_epoch,
|
||||||
|
'config': self.config.__dict__,
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.use_amp:
|
||||||
|
checkpoint['scaler_state_dict'] = self.scaler.state_dict()
|
||||||
|
|
||||||
|
# Save regular checkpoint
|
||||||
|
checkpoint_path = Path(self.config.save_dir) / f"checkpoint-{self.global_step}.pt"
|
||||||
|
torch.save(checkpoint, checkpoint_path)
|
||||||
|
print(f" Checkpoint saved: {checkpoint_path}")
|
||||||
|
|
||||||
|
# Save best model
|
||||||
|
if is_best:
|
||||||
|
best_path = Path(self.config.save_dir) / "best_model.pt"
|
||||||
|
torch.save(checkpoint, best_path)
|
||||||
|
print(f" Best model saved: {best_path}")
|
||||||
|
|
||||||
|
def load_checkpoint(self, checkpoint_path: str):
|
||||||
|
"""Load from checkpoint"""
|
||||||
|
checkpoint = torch.load(checkpoint_path, map_location=self.device)
|
||||||
|
|
||||||
|
model_to_load = self.model.module if self.is_ddp else self.model
|
||||||
|
model_to_load.load_state_dict(checkpoint['model_state_dict'])
|
||||||
|
|
||||||
|
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
|
||||||
|
self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
|
||||||
|
self.global_step = checkpoint['global_step']
|
||||||
|
self.current_epoch = checkpoint['epoch']
|
||||||
|
|
||||||
|
if self.use_amp and 'scaler_state_dict' in checkpoint:
|
||||||
|
self.scaler.load_state_dict(checkpoint['scaler_state_dict'])
|
||||||
|
|
||||||
|
print(f"Resumed from checkpoint: {checkpoint_path}")
|
||||||
|
print(f" Global step: {self.global_step}")
|
||||||
|
print(f" Epoch: {self.current_epoch}")
|
22
requirements.txt
Normal file
22
requirements.txt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
# Core dependencies for NOVA
|
||||||
|
torch>=2.0.0
|
||||||
|
sentencepiece>=0.1.99
|
||||||
|
numpy>=1.24.0
|
||||||
|
pyyaml>=6.0
|
||||||
|
tqdm>=4.65.0
|
||||||
|
safetensors>=0.3.1
|
||||||
|
|
||||||
|
# Chat API
|
||||||
|
fastapi>=0.100.0
|
||||||
|
uvicorn>=0.23.0
|
||||||
|
|
||||||
|
# Data processing
|
||||||
|
datasets>=2.14.0
|
||||||
|
huggingface-hub>=0.16.0
|
||||||
|
|
||||||
|
# Development
|
||||||
|
pytest>=7.4.0
|
||||||
|
pytest-cov>=4.1.0
|
||||||
|
black>=23.7.0
|
||||||
|
ruff>=0.0.280
|
||||||
|
mypy>=1.4.0
|
192
scripts/cli.py
Normal file
192
scripts/cli.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
"""
|
||||||
|
NOVA Command Line Interface
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from nova_core import NovaTransformer, ModelConfig, MODEL_125M, MODEL_350M, MODEL_1_3B
|
||||||
|
from nova_tokenizer import NovaTokenizer, train_tokenizer
|
||||||
|
from nova_train import NovaTrainer, TrainingConfig
|
||||||
|
from nova_chat import ChatAgent, PersonaLoader
|
||||||
|
from nova_data import DataPipeline
|
||||||
|
from nova_evo import EvolutionEngine, FitnessEvaluator, EvolutionConfig
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_init(args):
|
||||||
|
"""Initialize a new NOVA project"""
|
||||||
|
print("Initializing NOVA project...")
|
||||||
|
|
||||||
|
# Create toy dataset
|
||||||
|
pipeline = DataPipeline()
|
||||||
|
toy_path = pipeline.create_toy_dataset()
|
||||||
|
|
||||||
|
print(f"\n✓ NOVA initialized!")
|
||||||
|
print(f" Toy dataset: {toy_path}")
|
||||||
|
print(f"\nNext steps:")
|
||||||
|
print(f" 1. Train tokenizer: nova tokenizer train --input {toy_path}")
|
||||||
|
print(f" 2. Train model: nova train --config configs/model/125M.yaml")
|
||||||
|
print(f" 3. Chat: nova chat cli")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_tokenizer_train(args):
|
||||||
|
"""Train a tokenizer"""
|
||||||
|
print(f"Training tokenizer on {args.input}...")
|
||||||
|
|
||||||
|
model_path = train_tokenizer(
|
||||||
|
input_files=[args.input],
|
||||||
|
model_prefix=args.output,
|
||||||
|
vocab_size=args.vocab_size,
|
||||||
|
model_type=args.model_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✓ Tokenizer saved: {model_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_train(args):
|
||||||
|
"""Train a model"""
|
||||||
|
print("Training NOVA model...")
|
||||||
|
|
||||||
|
# Load model config
|
||||||
|
if args.size == "125m":
|
||||||
|
model_config = MODEL_125M
|
||||||
|
elif args.size == "350m":
|
||||||
|
model_config = MODEL_350M
|
||||||
|
elif args.size == "1.3b":
|
||||||
|
model_config = MODEL_1_3B
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown size: {args.size}")
|
||||||
|
|
||||||
|
# Create model
|
||||||
|
model = NovaTransformer(model_config)
|
||||||
|
|
||||||
|
print(f"Model: {model.get_num_params() / 1e6:.1f}M parameters")
|
||||||
|
|
||||||
|
# TODO: Load dataset and create dataloader
|
||||||
|
# For now, this is a placeholder
|
||||||
|
print("\n⚠️ Training not fully implemented - requires dataset")
|
||||||
|
print("See nova_train/trainer.py for implementation")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_chat_cli(args):
|
||||||
|
"""Start CLI chat"""
|
||||||
|
print("NOVA Chat Interface")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Load model and tokenizer
|
||||||
|
# TODO: Implement model/tokenizer loading from checkpoint
|
||||||
|
|
||||||
|
print("\n⚠️ Chat requires trained model and tokenizer")
|
||||||
|
print("Please train a model first with: nova train")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_chat_serve(args):
|
||||||
|
"""Start REST API server"""
|
||||||
|
print(f"Starting NOVA chat API server on {args.host}:{args.port}...")
|
||||||
|
|
||||||
|
# TODO: Implement FastAPI server
|
||||||
|
print("\n⚠️ REST API not fully implemented")
|
||||||
|
print("See nova_chat/ for implementation")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_evo_run(args):
|
||||||
|
"""Run evolution"""
|
||||||
|
print("Starting NOVA-EVO...")
|
||||||
|
|
||||||
|
# TODO: Implement evolution with dataset
|
||||||
|
print("\n⚠️ Evolution requires dataset and compute budget")
|
||||||
|
print("See nova_evo/ for implementation")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_data_build(args):
|
||||||
|
"""Build dataset"""
|
||||||
|
pipeline = DataPipeline()
|
||||||
|
|
||||||
|
if args.source:
|
||||||
|
pipeline.download_source(args.source, dry_run=args.dry_run)
|
||||||
|
else:
|
||||||
|
print("Available sources:")
|
||||||
|
from nova_data import LegalDatasetRegistry
|
||||||
|
|
||||||
|
for source in LegalDatasetRegistry.list_sources():
|
||||||
|
print(f"\n {source.name}")
|
||||||
|
print(f" License: {source.license.value}")
|
||||||
|
print(f" Size: {source.estimated_size_gb} GB")
|
||||||
|
print(f" {source.description}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main CLI entry point"""
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="NOVA - Neuro-Optimizing Versatile Agent",
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
)
|
||||||
|
|
||||||
|
subparsers = parser.add_subparsers(dest='command', help='Commands')
|
||||||
|
|
||||||
|
# Init
|
||||||
|
parser_init = subparsers.add_parser('init', help='Initialize NOVA project')
|
||||||
|
parser_init.set_defaults(func=cmd_init)
|
||||||
|
|
||||||
|
# Tokenizer
|
||||||
|
parser_tok = subparsers.add_parser('tokenizer', help='Tokenizer commands')
|
||||||
|
tok_sub = parser_tok.add_subparsers(dest='tokenizer_command')
|
||||||
|
|
||||||
|
tok_train = tok_sub.add_parser('train', help='Train tokenizer')
|
||||||
|
tok_train.add_argument('--input', required=True, help='Input text file')
|
||||||
|
tok_train.add_argument('--output', default='tokenizer', help='Output prefix')
|
||||||
|
tok_train.add_argument('--vocab-size', type=int, default=32000)
|
||||||
|
tok_train.add_argument('--model-type', default='bpe', choices=['bpe', 'unigram'])
|
||||||
|
tok_train.set_defaults(func=cmd_tokenizer_train)
|
||||||
|
|
||||||
|
# Train
|
||||||
|
parser_train = subparsers.add_parser('train', help='Train model')
|
||||||
|
parser_train.add_argument('--size', default='125m', choices=['125m', '350m', '1.3b'])
|
||||||
|
parser_train.add_argument('--config', help='Training config file')
|
||||||
|
parser_train.set_defaults(func=cmd_train)
|
||||||
|
|
||||||
|
# Chat
|
||||||
|
parser_chat = subparsers.add_parser('chat', help='Chat interface')
|
||||||
|
chat_sub = parser_chat.add_subparsers(dest='chat_command')
|
||||||
|
|
||||||
|
chat_cli = chat_sub.add_parser('cli', help='CLI chat')
|
||||||
|
chat_cli.add_argument('--persona', help='Persona file')
|
||||||
|
chat_cli.set_defaults(func=cmd_chat_cli)
|
||||||
|
|
||||||
|
chat_serve = chat_sub.add_parser('serve', help='REST API server')
|
||||||
|
chat_serve.add_argument('--host', default='0.0.0.0')
|
||||||
|
chat_serve.add_argument('--port', type=int, default=8000)
|
||||||
|
chat_serve.set_defaults(func=cmd_chat_serve)
|
||||||
|
|
||||||
|
# Evolution
|
||||||
|
parser_evo = subparsers.add_parser('evo', help='Evolution commands')
|
||||||
|
evo_sub = parser_evo.add_subparsers(dest='evo_command')
|
||||||
|
|
||||||
|
evo_run = evo_sub.add_parser('run', help='Run evolution')
|
||||||
|
evo_run.add_argument('--budget', default='small', choices=['small', 'medium', 'large'])
|
||||||
|
evo_run.set_defaults(func=cmd_evo_run)
|
||||||
|
|
||||||
|
# Data
|
||||||
|
parser_data = subparsers.add_parser('data', help='Data commands')
|
||||||
|
data_sub = parser_data.add_subparsers(dest='data_command')
|
||||||
|
|
||||||
|
data_build = data_sub.add_parser('build', help='Build dataset')
|
||||||
|
data_build.add_argument('--source', help='Source name')
|
||||||
|
data_build.add_argument('--dry-run', action='store_true')
|
||||||
|
data_build.set_defaults(func=cmd_data_build)
|
||||||
|
|
||||||
|
# Parse and execute
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if hasattr(args, 'func'):
|
||||||
|
args.func(args)
|
||||||
|
else:
|
||||||
|
parser.print_help()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
87
scripts/quickstart.sh
Normal file
87
scripts/quickstart.sh
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# NOVA Quickstart Script
|
||||||
|
# Sets up NOVA for first-time use
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "======================================"
|
||||||
|
echo "NOVA Quickstart"
|
||||||
|
echo "======================================"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Check Python version
|
||||||
|
echo "Checking Python version..."
|
||||||
|
python_version=$(python --version 2>&1 | grep -oP '(?<=Python )\d+\.\d+')
|
||||||
|
required_version="3.10"
|
||||||
|
|
||||||
|
if [ "$(printf '%s\n' "$required_version" "$python_version" | sort -V | head -n1)" != "$required_version" ]; then
|
||||||
|
echo "❌ Python 3.10+ required. Found: $python_version"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "✓ Python $python_version"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Create virtual environment
|
||||||
|
if [ ! -d "venv" ]; then
|
||||||
|
echo "Creating virtual environment..."
|
||||||
|
python -m venv venv
|
||||||
|
echo "✓ Virtual environment created"
|
||||||
|
else
|
||||||
|
echo "✓ Virtual environment exists"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Activate virtual environment
|
||||||
|
echo "Activating virtual environment..."
|
||||||
|
if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "win32" ]]; then
|
||||||
|
source venv/Scripts/activate
|
||||||
|
else
|
||||||
|
source venv/bin/activate
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "✓ Virtual environment activated"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
echo "Installing dependencies..."
|
||||||
|
pip install --upgrade pip > /dev/null
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
echo "✓ Dependencies installed"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Install NOVA in development mode
|
||||||
|
echo "Installing NOVA..."
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
echo "✓ NOVA installed"
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Initialize project
|
||||||
|
echo "Initializing NOVA project..."
|
||||||
|
python scripts/cli.py init
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "======================================"
|
||||||
|
echo "✓ NOVA Setup Complete!"
|
||||||
|
echo "======================================"
|
||||||
|
echo ""
|
||||||
|
echo "Next steps:"
|
||||||
|
echo ""
|
||||||
|
echo "1. Train tokenizer:"
|
||||||
|
echo " python scripts/cli.py tokenizer train --input data/toy_dataset/toy.txt"
|
||||||
|
echo ""
|
||||||
|
echo "2. (Optional) Download legal datasets:"
|
||||||
|
echo " python scripts/cli.py data build --source wikipedia-en"
|
||||||
|
echo ""
|
||||||
|
echo "3. Train model:"
|
||||||
|
echo " python scripts/cli.py train --size 125m"
|
||||||
|
echo ""
|
||||||
|
echo "4. Chat:"
|
||||||
|
echo " python scripts/cli.py chat cli"
|
||||||
|
echo ""
|
||||||
|
echo "For more info: cat README.md"
|
||||||
|
echo ""
|
59
setup.py
Normal file
59
setup.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
"""
|
||||||
|
NOVA - Neuro-Optimizing Versatile Agent
|
||||||
|
A local-first transformer LLM with genetic evolution and persona support
|
||||||
|
"""
|
||||||
|
|
||||||
|
from setuptools import setup, find_packages
|
||||||
|
|
||||||
|
with open("README.md", "r", encoding="utf-8") as fh:
|
||||||
|
long_description = fh.read()
|
||||||
|
|
||||||
|
setup(
|
||||||
|
name="nova-llm",
|
||||||
|
version="0.1.0",
|
||||||
|
author="NOVA Project Contributors",
|
||||||
|
description="Local-first transformer LLM with genetic evolution and persona support",
|
||||||
|
long_description=long_description,
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
|
url="https://github.com/yourusername/nova",
|
||||||
|
packages=find_packages(),
|
||||||
|
classifiers=[
|
||||||
|
"Development Status :: 3 - Alpha",
|
||||||
|
"Intended Audience :: Developers",
|
||||||
|
"Intended Audience :: Science/Research",
|
||||||
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
|
],
|
||||||
|
python_requires=">=3.10.6",
|
||||||
|
install_requires=[
|
||||||
|
"torch>=2.0.0",
|
||||||
|
"sentencepiece>=0.1.99",
|
||||||
|
"numpy>=1.24.0",
|
||||||
|
"pyyaml>=6.0",
|
||||||
|
"tqdm>=4.65.0",
|
||||||
|
"safetensors>=0.3.1",
|
||||||
|
"fastapi>=0.100.0",
|
||||||
|
"uvicorn>=0.23.0",
|
||||||
|
"datasets>=2.14.0",
|
||||||
|
"huggingface-hub>=0.16.0",
|
||||||
|
],
|
||||||
|
extras_require={
|
||||||
|
"dev": [
|
||||||
|
"pytest>=7.4.0",
|
||||||
|
"pytest-cov>=4.1.0",
|
||||||
|
"black>=23.7.0",
|
||||||
|
"ruff>=0.0.280",
|
||||||
|
"mypy>=1.4.0",
|
||||||
|
],
|
||||||
|
"cuda": [
|
||||||
|
"nvidia-cuda-runtime-cu12>=12.0.0",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
entry_points={
|
||||||
|
"console_scripts": [
|
||||||
|
"nova=scripts.cli:main",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
)
|
3
tests/__init__.py
Normal file
3
tests/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
"""
|
||||||
|
NOVA Tests
|
||||||
|
"""
|
141
tests/test_core.py
Normal file
141
tests/test_core.py
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
"""
|
||||||
|
Tests for NOVA core transformer
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
from nova_core import NovaTransformer, ModelConfig, MODEL_125M
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_config():
|
||||||
|
"""Test model configuration"""
|
||||||
|
config = ModelConfig(
|
||||||
|
vocab_size=1000,
|
||||||
|
hidden_size=256,
|
||||||
|
num_hidden_layers=4,
|
||||||
|
num_attention_heads=4,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert config.vocab_size == 1000
|
||||||
|
assert config.hidden_size == 256
|
||||||
|
assert config.num_hidden_layers == 4
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_creation():
|
||||||
|
"""Test creating a small model"""
|
||||||
|
config = ModelConfig(
|
||||||
|
vocab_size=1000,
|
||||||
|
hidden_size=128,
|
||||||
|
num_hidden_layers=2,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=512,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = NovaTransformer(config)
|
||||||
|
|
||||||
|
assert model is not None
|
||||||
|
assert model.config == config
|
||||||
|
assert model.vocab_size == 1000
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_forward():
|
||||||
|
"""Test forward pass"""
|
||||||
|
config = ModelConfig(
|
||||||
|
vocab_size=1000,
|
||||||
|
hidden_size=128,
|
||||||
|
num_hidden_layers=2,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=512,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = NovaTransformer(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
# Create dummy input
|
||||||
|
batch_size = 2
|
||||||
|
seq_len = 10
|
||||||
|
input_ids = torch.randint(0, 1000, (batch_size, seq_len))
|
||||||
|
|
||||||
|
# Forward pass
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(input_ids=input_ids)
|
||||||
|
|
||||||
|
assert 'logits' in outputs
|
||||||
|
assert outputs['logits'].shape == (batch_size, seq_len, 1000)
|
||||||
|
|
||||||
|
|
||||||
|
def test_model_generation():
|
||||||
|
"""Test text generation"""
|
||||||
|
config = ModelConfig(
|
||||||
|
vocab_size=1000,
|
||||||
|
hidden_size=128,
|
||||||
|
num_hidden_layers=2,
|
||||||
|
num_attention_heads=4,
|
||||||
|
intermediate_size=512,
|
||||||
|
max_position_embeddings=512,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = NovaTransformer(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
# Create dummy input
|
||||||
|
input_ids = torch.randint(0, 1000, (1, 5))
|
||||||
|
|
||||||
|
# Generate
|
||||||
|
with torch.no_grad():
|
||||||
|
output_ids = model.generate(
|
||||||
|
input_ids=input_ids,
|
||||||
|
max_new_tokens=10,
|
||||||
|
temperature=1.0,
|
||||||
|
do_sample=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert output_ids.shape[1] == 15 # 5 input + 10 generated
|
||||||
|
|
||||||
|
|
||||||
|
def test_kv_cache():
|
||||||
|
"""Test KV-cache functionality"""
|
||||||
|
config = ModelConfig(
|
||||||
|
vocab_size=1000,
|
||||||
|
hidden_size=128,
|
||||||
|
num_hidden_layers=2,
|
||||||
|
num_attention_heads=4,
|
||||||
|
use_cache=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = NovaTransformer(config)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
input_ids = torch.randint(0, 1000, (1, 5))
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
# First forward with cache
|
||||||
|
outputs1 = model(input_ids=input_ids, use_cache=True)
|
||||||
|
past_kv = outputs1['past_key_values']
|
||||||
|
|
||||||
|
assert past_kv is not None
|
||||||
|
assert len(past_kv) == config.num_hidden_layers
|
||||||
|
|
||||||
|
# Second forward with cache
|
||||||
|
new_input = torch.randint(0, 1000, (1, 1))
|
||||||
|
outputs2 = model(input_ids=new_input, past_key_values=past_kv, use_cache=True)
|
||||||
|
|
||||||
|
assert outputs2['logits'].shape[1] == 1 # Only new token
|
||||||
|
|
||||||
|
|
||||||
|
def test_param_count():
|
||||||
|
"""Test parameter counting"""
|
||||||
|
config = MODEL_125M
|
||||||
|
|
||||||
|
model = NovaTransformer(config)
|
||||||
|
|
||||||
|
num_params = model.get_num_params(non_embedding=False)
|
||||||
|
|
||||||
|
# Should be around 125M
|
||||||
|
assert 100_000_000 < num_params < 150_000_000
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
131
tests/test_persona.py
Normal file
131
tests/test_persona.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
"""
|
||||||
|
Tests for NOVA persona system
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from nova_chat import Persona, PersonalityMatrix, PersonaLoader
|
||||||
|
|
||||||
|
|
||||||
|
def test_personality_matrix():
|
||||||
|
"""Test personality matrix creation"""
|
||||||
|
matrix = PersonalityMatrix(
|
||||||
|
warmth=0.8,
|
||||||
|
humor=0.6,
|
||||||
|
empathy=0.9,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert matrix.warmth == 0.8
|
||||||
|
assert matrix.humor == 0.6
|
||||||
|
assert matrix.empathy == 0.9
|
||||||
|
|
||||||
|
# Test conversion
|
||||||
|
dict_form = matrix.to_dict()
|
||||||
|
assert 'warmth' in dict_form
|
||||||
|
assert dict_form['warmth'] == 0.8
|
||||||
|
|
||||||
|
|
||||||
|
def test_persona_creation():
|
||||||
|
"""Test persona creation"""
|
||||||
|
persona = Persona(
|
||||||
|
name="TestNOVA",
|
||||||
|
pronouns="she/her",
|
||||||
|
always_disclose=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert persona.name == "TestNOVA"
|
||||||
|
assert persona.pronouns == "she/her"
|
||||||
|
assert persona.always_disclose is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_persona_generation_params():
|
||||||
|
"""Test generation parameter modulation"""
|
||||||
|
# High warmth, low formality
|
||||||
|
persona = Persona(
|
||||||
|
personality=PersonalityMatrix(
|
||||||
|
warmth=0.9,
|
||||||
|
formality=0.1,
|
||||||
|
creativity=0.8,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
params = persona.get_generation_params()
|
||||||
|
|
||||||
|
assert 'temperature' in params
|
||||||
|
assert 'top_p' in params
|
||||||
|
assert 'max_new_tokens' in params
|
||||||
|
|
||||||
|
# Temperature should be adjusted by personality
|
||||||
|
assert params['temperature'] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_predefined_personas():
|
||||||
|
"""Test loading predefined personas"""
|
||||||
|
gentle = PersonaLoader.create_girlfriend_gentle()
|
||||||
|
playful = PersonaLoader.create_girlfriend_playful()
|
||||||
|
supportive = PersonaLoader.create_girlfriend_supportive()
|
||||||
|
|
||||||
|
assert gentle.name == "NOVA"
|
||||||
|
assert playful.name == "NOVA"
|
||||||
|
assert supportive.name == "NOVA"
|
||||||
|
|
||||||
|
# All should have no AI disclosure by default
|
||||||
|
assert gentle.always_disclose is False
|
||||||
|
assert playful.always_disclose is False
|
||||||
|
assert supportive.always_disclose is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_persona_system_prompt():
|
||||||
|
"""Test system prompt formatting"""
|
||||||
|
persona = Persona(
|
||||||
|
system_prompt="You are a helpful assistant.",
|
||||||
|
always_disclose=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
prompt = persona.format_system_prompt()
|
||||||
|
|
||||||
|
assert "helpful assistant" in prompt.lower()
|
||||||
|
|
||||||
|
# Should not include disclosure when set to False
|
||||||
|
assert persona.always_disclose is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_persona_serialization():
|
||||||
|
"""Test saving/loading persona"""
|
||||||
|
original = Persona(
|
||||||
|
name="TestPersona",
|
||||||
|
pronouns="they/them",
|
||||||
|
description="Test description",
|
||||||
|
always_disclose=True,
|
||||||
|
disclosure_text="I am an AI assistant.",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert to dict and back
|
||||||
|
data = original.to_dict()
|
||||||
|
loaded = Persona.from_dict(data)
|
||||||
|
|
||||||
|
assert loaded.name == original.name
|
||||||
|
assert loaded.pronouns == original.pronouns
|
||||||
|
assert loaded.always_disclose == original.always_disclose
|
||||||
|
assert loaded.disclosure_text == original.disclosure_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_personality_trait_ranges():
|
||||||
|
"""Test that personality traits stay in valid ranges"""
|
||||||
|
persona = Persona(
|
||||||
|
personality=PersonalityMatrix(
|
||||||
|
warmth=1.0, # Max
|
||||||
|
formality=0.0, # Min
|
||||||
|
creativity=0.5, # Mid
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
params = persona.get_generation_params()
|
||||||
|
|
||||||
|
# Parameters should be within valid ranges
|
||||||
|
assert 0.1 <= params['temperature'] <= 2.0
|
||||||
|
assert 0.5 <= params['top_p'] <= 1.0
|
||||||
|
assert params['max_new_tokens'] > 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
105
tests/test_tokenizer.py
Normal file
105
tests/test_tokenizer.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""
|
||||||
|
Tests for NOVA tokenizer
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from nova_tokenizer import train_tokenizer, NovaTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_training():
|
||||||
|
"""Test training a tokenizer"""
|
||||||
|
# Create temporary training file
|
||||||
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
||||||
|
for i in range(100):
|
||||||
|
f.write(f"This is sentence number {i}. Hello world!\n")
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
# Create temporary output
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
output_prefix = str(Path(tmpdir) / "test_tokenizer")
|
||||||
|
|
||||||
|
# Train
|
||||||
|
model_path = train_tokenizer(
|
||||||
|
input_files=[temp_file],
|
||||||
|
model_prefix=output_prefix,
|
||||||
|
vocab_size=500,
|
||||||
|
model_type='bpe',
|
||||||
|
)
|
||||||
|
|
||||||
|
assert Path(model_path).exists()
|
||||||
|
assert model_path.endswith('.model')
|
||||||
|
|
||||||
|
# Clean up
|
||||||
|
Path(temp_file).unlink()
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_encode_decode():
|
||||||
|
"""Test encoding and decoding"""
|
||||||
|
# Create and train a tiny tokenizer
|
||||||
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
||||||
|
f.write("hello world " * 100)
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
output_prefix = str(Path(tmpdir) / "test_tok")
|
||||||
|
|
||||||
|
model_path = train_tokenizer(
|
||||||
|
input_files=[temp_file],
|
||||||
|
model_prefix=output_prefix,
|
||||||
|
vocab_size=100,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Load tokenizer
|
||||||
|
tokenizer = NovaTokenizer(model_path)
|
||||||
|
|
||||||
|
# Test encode/decode
|
||||||
|
text = "hello world"
|
||||||
|
ids = tokenizer.encode(text, add_bos=False, add_eos=False)
|
||||||
|
|
||||||
|
assert isinstance(ids, list)
|
||||||
|
assert len(ids) > 0
|
||||||
|
|
||||||
|
decoded = tokenizer.decode(ids, skip_special_tokens=True)
|
||||||
|
# May not be exact due to tokenization, but should be similar
|
||||||
|
assert "hello" in decoded.lower()
|
||||||
|
|
||||||
|
Path(temp_file).unlink()
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokenizer_batch():
|
||||||
|
"""Test batch encoding"""
|
||||||
|
# Quick test with dummy tokenizer
|
||||||
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
|
||||||
|
f.write("test " * 100)
|
||||||
|
temp_file = f.name
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
output_prefix = str(Path(tmpdir) / "batch_tok")
|
||||||
|
|
||||||
|
model_path = train_tokenizer(
|
||||||
|
input_files=[temp_file],
|
||||||
|
model_prefix=output_prefix,
|
||||||
|
vocab_size=100,
|
||||||
|
)
|
||||||
|
|
||||||
|
tokenizer = NovaTokenizer(model_path)
|
||||||
|
|
||||||
|
# Batch encode
|
||||||
|
texts = ["hello", "world", "test"]
|
||||||
|
batch_ids = tokenizer.encode_batch(texts, add_bos=False, add_eos=False)
|
||||||
|
|
||||||
|
assert len(batch_ids) == 3
|
||||||
|
assert all(isinstance(ids, list) for ids in batch_ids)
|
||||||
|
|
||||||
|
# Batch decode
|
||||||
|
decoded = tokenizer.decode_batch(batch_ids)
|
||||||
|
|
||||||
|
assert len(decoded) == 3
|
||||||
|
|
||||||
|
Path(temp_file).unlink()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v"])
|
Reference in New Issue
Block a user