"""
NOVA Command Line Interface
"""

import argparse
import sys
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from nova_core import NovaTransformer, ModelConfig, MODEL_125M, MODEL_350M, MODEL_1_3B
from nova_tokenizer import NovaTokenizer, train_tokenizer
from nova_train import NovaTrainer, TrainingConfig
from nova_chat import ChatAgent, PersonaLoader
from nova_data import DataPipeline
from nova_evo import EvolutionEngine, FitnessEvaluator, EvolutionConfig


# Maps the value of ``nova train --size`` to the preset config imported
# from nova_core.
_MODEL_CONFIGS = {
    "125m": MODEL_125M,
    "350m": MODEL_350M,
    "1.3b": MODEL_1_3B,
}


def cmd_init(args):
    """Initialize a new NOVA project.

    Creates the toy dataset through ``DataPipeline.create_toy_dataset()`` and
    prints the suggested next commands.

    Args:
        args: Parsed command-line namespace (not read by this command).
    """
    print("Initializing NOVA project...")

    # Create toy dataset
    pipeline = DataPipeline()
    toy_path = pipeline.create_toy_dataset()

    print("\n✓ NOVA initialized!")
    print(f" Toy dataset: {toy_path}")
    print("\nNext steps:")
    print(f" 1. Train tokenizer: nova tokenizer train --input {toy_path}")
    print(" 2. Train model: nova train --config configs/model/125M.yaml")
    print(" 3. Chat: nova chat cli")


def cmd_tokenizer_train(args):
    """Train a tokenizer.

    Forwards the command-line options to ``train_tokenizer`` and prints the
    value it returns.

    Args:
        args: Parsed namespace providing ``input``, ``output``,
            ``vocab_size`` and ``model_type``.
    """
    print(f"Training tokenizer on {args.input}...")

    model_path = train_tokenizer(
        input_files=[args.input],
        model_prefix=args.output,
        vocab_size=args.vocab_size,
        model_type=args.model_type,
    )

    print(f"\n✓ Tokenizer saved: {model_path}")


def cmd_train(args):
    """Train a model (placeholder: only builds the model and reports its size).

    Args:
        args: Parsed namespace providing ``size``.

    Raises:
        ValueError: If ``args.size`` is not one of the known preset sizes.
    """
    print("Training NOVA model...")

    # Load model config
    try:
        model_config = _MODEL_CONFIGS[args.size]
    except KeyError:
        raise ValueError(f"Unknown size: {args.size}") from None

    # Create model
    model = NovaTransformer(model_config)

    print(f"Model: {model.get_num_params() / 1e6:.1f}M parameters")

    # TODO: Load dataset and create dataloader
    # For now, this is a placeholder
    print("\n⚠️ Training not fully implemented - requires dataset")
    print("See nova_train/trainer.py for implementation")


def cmd_chat_cli(args):
    """Start CLI chat (placeholder: prints a notice only).

    Args:
        args: Parsed command-line namespace (not read by this command).
    """
    print("NOVA Chat Interface")
    print("=" * 60)

    # Load model and tokenizer
    # TODO: Implement model/tokenizer loading from checkpoint
    print("\n⚠️ Chat requires trained model and tokenizer")
    print("Please train a model first with: nova train")


def cmd_chat_serve(args):
    """Start REST API server (placeholder: prints a notice only).

    Args:
        args: Parsed namespace providing ``host`` and ``port``.
    """
    print(f"Starting NOVA chat API server on {args.host}:{args.port}...")

    # TODO: Implement FastAPI server
    print("\n⚠️ REST API not fully implemented")
    print("See nova_chat/ for implementation")


def cmd_evo_run(args):
    """Run evolution (placeholder: prints a notice only).

    Args:
        args: Parsed command-line namespace (not read by this command).
    """
    print("Starting NOVA-EVO...")

    # TODO: Implement evolution with dataset
    print("\n⚠️ Evolution requires dataset and compute budget")
    print("See nova_evo/ for implementation")


def cmd_data_build(args):
    """Build dataset.

    With ``--source`` the named source is passed to
    ``DataPipeline.download_source``; without it, the sources returned by
    ``LegalDatasetRegistry.list_sources()`` are listed.

    Args:
        args: Parsed namespace providing ``source`` and ``dry_run``.
    """
    pipeline = DataPipeline()

    if args.source:
        pipeline.download_source(args.source, dry_run=args.dry_run)
        return

    print("Available sources:")
    from nova_data import LegalDatasetRegistry

    for source in LegalDatasetRegistry.list_sources():
        print(f"\n {source.name}")
        print(f" License: {source.license.value}")
        print(f" Size: {source.estimated_size_gb} GB")
        print(f" {source.description}")


def _build_parser():
    """Build the argument parser for the ``nova`` command.

    Returns:
        A ``(parser, groups)`` tuple: the top-level parser, and a dict mapping
        the name of each command group that has sub-commands to that group's
        parser, so the caller can show group-specific help.
    """
    parser = argparse.ArgumentParser(
        description="NOVA - Neuro-Optimizing Versatile Agent",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subparsers = parser.add_subparsers(dest='command', help='Commands')

    # Init
    parser_init = subparsers.add_parser('init', help='Initialize NOVA project')
    parser_init.set_defaults(func=cmd_init)

    # Tokenizer
    parser_tok = subparsers.add_parser('tokenizer', help='Tokenizer commands')
    tok_sub = parser_tok.add_subparsers(dest='tokenizer_command')

    tok_train = tok_sub.add_parser('train', help='Train tokenizer')
    tok_train.add_argument('--input', required=True, help='Input text file')
    tok_train.add_argument('--output', default='tokenizer', help='Output prefix')
    tok_train.add_argument('--vocab-size', type=int, default=32000)
    tok_train.add_argument('--model-type', default='bpe', choices=['bpe', 'unigram'])
    tok_train.set_defaults(func=cmd_tokenizer_train)

    # Train
    parser_train = subparsers.add_parser('train', help='Train model')
    parser_train.add_argument('--size', default='125m', choices=['125m', '350m', '1.3b'])
    parser_train.add_argument('--config', help='Training config file')
    parser_train.set_defaults(func=cmd_train)

    # Chat
    parser_chat = subparsers.add_parser('chat', help='Chat interface')
    chat_sub = parser_chat.add_subparsers(dest='chat_command')

    chat_cli = chat_sub.add_parser('cli', help='CLI chat')
    chat_cli.add_argument('--persona', help='Persona file')
    chat_cli.set_defaults(func=cmd_chat_cli)

    chat_serve = chat_sub.add_parser('serve', help='REST API server')
    chat_serve.add_argument('--host', default='0.0.0.0')
    chat_serve.add_argument('--port', type=int, default=8000)
    chat_serve.set_defaults(func=cmd_chat_serve)

    # Evolution
    parser_evo = subparsers.add_parser('evo', help='Evolution commands')
    evo_sub = parser_evo.add_subparsers(dest='evo_command')

    evo_run = evo_sub.add_parser('run', help='Run evolution')
    evo_run.add_argument('--budget', default='small', choices=['small', 'medium', 'large'])
    evo_run.set_defaults(func=cmd_evo_run)

    # Data
    parser_data = subparsers.add_parser('data', help='Data commands')
    data_sub = parser_data.add_subparsers(dest='data_command')

    data_build = data_sub.add_parser('build', help='Build dataset')
    data_build.add_argument('--source', help='Source name')
    data_build.add_argument('--dry-run', action='store_true')
    data_build.set_defaults(func=cmd_data_build)

    groups = {
        'tokenizer': parser_tok,
        'chat': parser_chat,
        'evo': parser_evo,
        'data': parser_data,
    }
    return parser, groups


def main(argv=None):
    """Main CLI entry point.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the original
            behaviour of reading the process arguments.
    """
    parser, groups = _build_parser()

    # Parse and execute
    args = parser.parse_args(argv)

    handler = getattr(args, 'func', None)
    if handler is not None:
        handler(args)
        return

    # No runnable command was selected. If the user named a command group
    # (e.g. ``nova chat``) show that group's help so its sub-commands are
    # visible; otherwise fall back to the top-level help.
    groups.get(args.command, parser).print_help()


if __name__ == '__main__':
    main()