"""
Tests for knowledge acquisition and processing systems.
"""

import asyncio
import json
from pathlib import Path
from unittest.mock import AsyncMock, Mock, patch

import pytest

from lyra.knowledge.gutenberg_crawler import GutenbergBook, GutenbergCrawler
from lyra.knowledge.knowledge_processor import KnowledgeProcessor, ProcessedKnowledge
from tests.conftest import create_mock_response


class TestGutenbergBook:
    """Tests for Gutenberg book representation."""

    def test_gutenberg_book_initialization(self, sample_gutenberg_book):
        """The shared fixture exposes every field it was built with."""
        book = sample_gutenberg_book

        assert book.id == 12345
        assert book.title == "Sample Public Domain Book"
        assert book.author == "Test Author"
        assert book.language == "en"
        assert book.category == "Fiction"
        assert book.copyright_status == "public_domain"
        assert book.quality_score == 0.8
        assert book.metadata is not None

    def test_gutenberg_book_post_init(self):
        """A book built without metadata ends up with an empty dict."""
        book = GutenbergBook(
            id=1,
            title="Test",
            author="Author",
            language="en",
            category="Test",
            url="http://test.com",
            file_format="txt",
            download_url="http://test.com/file.txt",
        )

        assert book.metadata == {}  # Should initialize empty dict


class TestGutenbergCrawler:
    """Tests for the Gutenberg crawler."""

    def test_crawler_initialization(self):
        """Constructor arguments are stored and the bookkeeping starts empty."""
        crawler = GutenbergCrawler(
            base_url="https://www.gutenberg.org",
            rate_limit=1.0,
            max_concurrent=2,
        )

        assert crawler.base_url == "https://www.gutenberg.org"
        assert crawler.rate_limit == 1.0
        assert crawler.max_concurrent == 2
        assert len(crawler.crawled_books) == 0
        assert len(crawler.failed_downloads) == 0

    @pytest.mark.asyncio
    async def test_crawler_async_context_manager(self):
        """Entering the crawler as an async context manager creates a session."""
        # The access check is patched out so the test never touches the network.
        with patch.object(
            GutenbergCrawler, '_verify_gutenberg_access', new_callable=AsyncMock
        ):
            async with GutenbergCrawler() as crawler:
                assert crawler.session is not None

    @pytest.mark.asyncio
    async def test_book_details_extraction(self):
        """Book details are built from a mocked 200 response."""
        crawler = GutenbergCrawler()

        # Mock HTML content.
        # NOTE(review): the markup below is a reconstruction around the known
        # text nodes (title, author, language, download link); confirm the
        # element names and the href against what _get_book_details parses.
        mock_html = """
        <html>
            <head><title>Test Book</title></head>
            <body>
                <h1>Test Book</h1>
                <a href="/browse/authors/t">Test Author</a>
                <table>
                    <tr><th>Language</th><td>Language: English</td></tr>
                </table>
                <a href="/files/123/123.txt">Download TXT</a>
            </body>
        </html>
        """

        with patch.object(crawler, '_rate_limited_request') as mock_request:
            mock_response = Mock()
            mock_response.status = 200
            mock_response.text = AsyncMock(return_value=mock_html)
            mock_request.return_value = mock_response

            book = await crawler._get_book_details(123, "Test Book", "Fiction")

        assert book is not None
        assert book.id == 123
        assert book.title == "Test Book"
        assert book.category == "Fiction"

    def test_download_appropriateness_check(self, sample_gutenberg_book):
        """Language, file format and copyright status each gate a download."""
        crawler = GutenbergCrawler()

        # Should be appropriate (public domain, allowed format)
        assert crawler._is_download_appropriate(sample_gutenberg_book) is True

        # Test with excluded language
        crawler.excluded_languages = ['en']
        assert crawler._is_download_appropriate(sample_gutenberg_book) is False

        # Test with disallowed format
        crawler.excluded_languages = []
        sample_gutenberg_book.file_format = 'pdf'
        assert crawler._is_download_appropriate(sample_gutenberg_book) is False

        # Test with non-public domain
        sample_gutenberg_book.file_format = 'txt'
        sample_gutenberg_book.copyright_status = 'copyrighted'
        assert crawler._is_download_appropriate(sample_gutenberg_book) is False

    @pytest.mark.asyncio
    async def test_legal_validation(self, sample_gutenberg_book):
        """Legal status validation accepts Gutenberg books."""
        crawler = GutenbergCrawler()

        # Public domain book should be valid
        is_valid = await crawler.validate_legal_status(sample_gutenberg_book)
        assert is_valid is True

        # Test with non-public domain
        sample_gutenberg_book.copyright_status = "copyrighted"
        is_valid = await crawler.validate_legal_status(sample_gutenberg_book)
        assert is_valid is True  # Still returns True for Gutenberg books

    @pytest.mark.parametrize(
        ("url", "expected_format"),
        [
            ("http://example.com/book.txt", "txt"),
            ("http://example.com/book.html", "html"),
            ("http://example.com/book.epub", "epub"),
            ("http://example.com/book", "txt"),  # Default
        ],
    )
    def test_file_format_determination(self, url, expected_format):
        """The file format is derived from the URL, defaulting to txt."""
        crawler = GutenbergCrawler()

        assert crawler._determine_file_format(url) == expected_format

    def test_download_statistics(self):
        """Statistics summarise discovered books and failed downloads."""
        crawler = GutenbergCrawler()

        # Add some mock data
        book1 = GutenbergBook(
            1, "Book 1", "Author 1", "en", "Fiction",
            "url1", "txt", "download1", quality_score=0.8,
        )
        book2 = GutenbergBook(
            2, "Book 2", "Author 2", "fr", "Science",
            "url2", "html", "download2", quality_score=0.9,
        )
        crawler.crawled_books = {1: book1, 2: book2}
        crawler.failed_downloads = [3, 4]

        stats = crawler.get_download_statistics()

        assert stats['total_discovered'] == 2
        assert stats['failed_downloads'] == 2
        # 2 success, 2 failures; the rate is computed, so compare approximately.
        assert stats['success_rate'] == pytest.approx(0.5)
        assert 'en' in stats['languages_discovered']
        assert 'fr' in stats['languages_discovered']
        assert 'Fiction' in stats['categories_discovered']
        assert 'Science' in stats['categories_discovered']

    @pytest.mark.asyncio
    async def test_book_recommendations(self):
        """Recommendations can be generated from a mocked discovery step."""
        crawler = GutenbergCrawler()
        limit = 5

        with patch.object(crawler, '_discover_books_in_category') as mock_discover:
            async def mock_generator(category, languages):
                if category == "Science":
                    yield GutenbergBook(
                        1, "Science Book", "Author", "en", "Science",
                        "url", "txt", "download",
                    )

            mock_discover.return_value = mock_generator("Science", ["en"])

            recommendations = await crawler.get_book_recommendations(
                interests=['science'],
                limit=limit,
            )

        # May be empty due to mocking, but should never exceed the requested
        # limit (presumably `limit` caps the result size -- confirm).
        assert len(recommendations) <= limit


class TestKnowledgeProcessor:
    """Tests for knowledge processing system."""

    @pytest.mark.asyncio
    async def test_processor_initialization(self, device):
        """Constructor arguments are stored and the NLP model is not loaded."""
        processor = KnowledgeProcessor(
            device=device,
            chunk_size=256,
            chunk_overlap=25,
        )

        assert processor.device == device
        assert processor.chunk_size == 256
        assert processor.chunk_overlap == 25
        assert processor.nlp is None  # Loaded lazily

    @pytest.mark.asyncio
    async def test_text_cleaning(self):
        """Gutenberg markers, space runs and blank-line runs are cleaned up."""
        processor = KnowledgeProcessor()

        # The fixture is assembled line by line so the runs of spaces and
        # blank lines under test cannot be lost to whitespace normalisation.
        spaced_phrase = "multiple    spaces"
        dirty_text = "\n".join([
            "",
            "*** START OF THE PROJECT GUTENBERG EBOOK TEST ***",
            "",
            "This is the actual content.",
            f"It has    {spaced_phrase}.",
            "",
            "",
            "",
            "And multiple newlines.",
            "",
            "*** END OF THE PROJECT GUTENBERG EBOOK TEST ***",
            "",
        ])

        cleaned = await processor._clean_text(dirty_text)

        assert "*** START OF" not in cleaned
        assert "*** END OF" not in cleaned
        assert spaced_phrase not in cleaned
        assert cleaned.count('\n\n\n') == 0  # No triple newlines

    @pytest.mark.asyncio
    async def test_title_extraction(self):
        """The title comes from the content, falling back to the filename."""
        processor = KnowledgeProcessor()

        # Test with content containing title
        content_with_title = """
        THE GREAT WORK

        Chapter 1

        This is the beginning of the story...
        """
        title = await processor._extract_title(content_with_title, "test_file.txt")
        assert "GREAT WORK" in title

        # Test with filename fallback
        title = await processor._extract_title(
            "No clear title here", "12345_the_book_title.txt"
        )
        assert "Book Title" in title

    @pytest.mark.parametrize(
        ("text", "expected_type"),
        [
            ("Short text", "short_paragraph"),
            ("Chapter 1: Introduction", "section_header"),
            (
                "This is a normal paragraph with sufficient length to be "
                "classified properly.",
                "paragraph",
            ),
            ("List of items:", "list_header"),
        ],
    )
    def test_chunk_type_determination(self, text, expected_type):
        """Each sample text is classified as the expected chunk type."""
        processor = KnowledgeProcessor()

        assert processor._determine_chunk_type(text) == expected_type

    @pytest.mark.asyncio
    async def test_quality_score_calculation(self):
        """Scholarly text scores above 0.5 and gossip scores below it."""
        processor = KnowledgeProcessor()

        # High quality content
        high_quality = """
        This is a well-researched scientific study that presents important
        findings based on rigorous analysis. The research methodology was
        peer-reviewed and published in an academic journal. The results
        show significant evidence for the hypothesis tested.
        """ * 10  # Make it longer
        quality = await processor._calculate_quality_score(
            high_quality, "Scientific Research Study"
        )
        assert quality > 0.5

        # Lower quality content
        low_quality = "unverified rumor gossip speculation fake news"
        quality = await processor._calculate_quality_score(low_quality, "Gossip")
        assert quality < 0.5

    @pytest.mark.asyncio
    async def test_category_classification(self):
        """Science and history samples are assigned their categories."""
        processor = KnowledgeProcessor()

        # Science content
        science_content = """
        This research examines the quantum mechanics of particle physics.
        The experiment was conducted using advanced scientific methods
        to test the hypothesis about atomic behavior.
        """
        category, subcategory = await processor._classify_content(
            science_content, "Quantum Physics Research"
        )
        assert category == "science"

        # History content
        history_content = """
        The ancient Roman Empire was a vast civilization that dominated
        the Mediterranean world for centuries. The empire's military
        conquests and cultural achievements shaped history.
        """
        category, subcategory = await processor._classify_content(
            history_content, "Roman Empire History"
        )
        assert category == "history"

    @pytest.mark.asyncio
    async def test_complexity_score_calculation(self):
        """Complexity scores stay within the closed interval [0, 1]."""
        processor = KnowledgeProcessor()

        # Simple text
        simple_text = (
            "This is easy to read. The words are simple. "
            "Anyone can understand this."
        )
        complexity = await processor._calculate_complexity_score(simple_text)
        assert 0.0 <= complexity <= 1.0

        # Complex text
        complex_text = """
        The epistemological ramifications of phenomenological investigations
        require sophisticated methodological approaches to hermeneutical analysis.
        """
        complexity = await processor._calculate_complexity_score(complex_text)
        assert 0.0 <= complexity <= 1.0

    def test_processing_statistics(self):
        """The statistics dict exposes all required keys."""
        processor = KnowledgeProcessor()

        stats = processor.get_processing_statistics()

        required_keys = [
            'models_loaded',
            'chunk_size',
            'chunk_overlap',
            'supported_categories',
            'device',
        ]
        for key in required_keys:
            assert key in stats

        assert isinstance(stats['supported_categories'], list)
        assert len(stats['supported_categories']) > 0

    @pytest.mark.asyncio
    async def test_processed_knowledge_creation(self, sample_book_content):
        """_process_content assembles a ProcessedKnowledge from its helpers."""
        processor = KnowledgeProcessor()

        # Mock the heavy NLP models for testing
        with patch.object(processor, '_generate_summary') as mock_summary, \
                patch.object(processor, '_extract_concepts') as mock_concepts, \
                patch.object(processor, '_extract_keywords') as mock_keywords, \
                patch.object(processor, '_classify_content') as mock_classify, \
                patch.object(processor, '_generate_embedding') as mock_embedding:
            mock_summary.return_value = "Test summary"
            mock_concepts.return_value = ["science", "method", "hypothesis"]
            mock_keywords.return_value = ["scientific", "research", "study"]
            mock_classify.return_value = ("science", "methodology")
            mock_embedding.return_value = None

            result = await processor._process_content(
                title="The Art of Science",
                content=sample_book_content,
                source_metadata={'source': 'test'},
            )

        assert isinstance(result, ProcessedKnowledge)
        assert result.title == "The Art of Science"
        assert result.category == "science"
        assert result.subcategory == "methodology"
        assert len(result.keywords) > 0
        assert len(result.concepts) > 0

    @pytest.mark.asyncio
    async def test_web_content_processing(self):
        """Script and navigation content is removed before processing."""
        processor = KnowledgeProcessor()

        # The page must actually contain script and nav content, otherwise
        # the negative assertions below cannot fail.
        html_content = """
        <html>
            <head>
                <title>Test Article</title>
                <script>alert('test');</script>
            </head>
            <body>
                <nav>Navigation menu</nav>
                <main>
                    <h1>Main Content</h1>
                    <p>This is the main content of the article.</p>
                </main>
            </body>
        </html>
        """

        with patch.object(processor, '_process_content') as mock_process:
            mock_process.return_value = Mock(spec=ProcessedKnowledge)

            await processor.process_web_content(html_content, url="http://test.com")

            # Should have called _process_content with cleaned text
            mock_process.assert_called_once()
            args, kwargs = mock_process.call_args

        # The cleaned text may arrive as the second positional argument or as
        # the `content` keyword; accept either calling convention.
        cleaned_text = kwargs["content"] if "content" in kwargs else args[1]

        # Should not contain script or nav content
        assert "alert('test')" not in cleaned_text
        assert "Navigation menu" not in cleaned_text
        assert "Main Content" in cleaned_text


class TestProcessedKnowledge:
    """Tests for ProcessedKnowledge data structure."""

    def test_processed_knowledge_structure(self):
        """A ProcessedKnowledge keeps the values it was constructed with."""
        knowledge = ProcessedKnowledge(
            title="Test Knowledge",
            content="Test content",
            summary="Test summary",
            category="science",
            subcategory="physics",
            keywords=["test", "science"],
            concepts=["quantum", "mechanics"],
            quality_score=0.8,
            complexity_score=0.6,
            embedding=None,
            chunks=[],
            metadata={"source": "test"},
        )

        assert knowledge.title == "Test Knowledge"
        assert knowledge.category == "science"
        assert knowledge.quality_score == 0.8
        assert len(knowledge.keywords) == 2
        assert len(knowledge.concepts) == 2