# Lyra/tests/test_knowledge_systems.py

"""
Tests for knowledge acquisition and processing systems.
"""

import pytest
import asyncio
from pathlib import Path
from unittest.mock import Mock, AsyncMock, patch
import json

from lyra.knowledge.gutenberg_crawler import GutenbergCrawler, GutenbergBook
from lyra.knowledge.knowledge_processor import KnowledgeProcessor, ProcessedKnowledge
from tests.conftest import create_mock_response


class TestGutenbergBook:
    """Checks for the GutenbergBook record type."""

    def test_gutenberg_book_initialization(self, sample_gutenberg_book):
        """The fixture book exposes the expected value for each field."""
        # Attribute name -> value the ``sample_gutenberg_book`` fixture is
        # expected to carry.
        expected_fields = {
            "id": 12345,
            "title": "Sample Public Domain Book",
            "author": "Test Author",
            "language": "en",
            "category": "Fiction",
            "copyright_status": "public_domain",
            "quality_score": 0.8,
        }

        for field_name, expected_value in expected_fields.items():
            assert getattr(sample_gutenberg_book, field_name) == expected_value

        assert sample_gutenberg_book.metadata is not None

    def test_gutenberg_book_post_init(self):
        """A book built without ``metadata`` ends up with an empty dict."""
        required_fields = dict(
            id=1,
            title="Test",
            author="Author",
            language="en",
            category="Test",
            url="http://test.com",
            file_format="txt",
            download_url="http://test.com/file.txt",
        )
        bare_book = GutenbergBook(**required_fields)

        # ``metadata`` was never supplied, so it should come back as ``{}``.
        assert bare_book.metadata == {}


class TestGutenbergCrawler:
    """Tests for the Gutenberg crawler.

    Network-facing helpers are replaced with mocks wherever a test would
    otherwise need to perform a real request.
    """

    def test_crawler_initialization(self):
        """Constructor arguments are stored and the bookkeeping starts empty."""
        crawler = GutenbergCrawler(
            base_url="https://www.gutenberg.org",
            rate_limit=1.0,
            max_concurrent=2
        )

        assert crawler.base_url == "https://www.gutenberg.org"
        assert crawler.rate_limit == 1.0
        assert crawler.max_concurrent == 2
        assert len(crawler.crawled_books) == 0
        assert len(crawler.failed_downloads) == 0

    @pytest.mark.asyncio
    async def test_crawler_async_context_manager(self):
        """Entering the crawler as an async context manager sets ``session``."""
        # The access check is replaced with an AsyncMock so entering the
        # context does not depend on it (presumably a network call -- confirm).
        with patch.object(GutenbergCrawler, '_verify_gutenberg_access', new_callable=AsyncMock):
            async with GutenbergCrawler() as crawler:
                assert crawler.session is not None

    @pytest.mark.asyncio
    async def test_book_details_extraction(self):
        """``_get_book_details`` builds a book from a mocked HTML response."""
        crawler = GutenbergCrawler()

        # Mock HTML content
        mock_html = """
        <html>
            <head><title>Test Book</title></head>
            <body>
                <a href="/browse/authors/test">Test Author</a>
                <tr>Language:</tr>
                <td>English</td>
                <a href="/files/123/123-0.txt">Download TXT</a>
            </body>
        </html>
        """

        with patch.object(crawler, '_rate_limited_request') as mock_request:
            # Fake response: HTTP 200 whose awaited ``text()`` yields the HTML.
            mock_response = Mock()
            mock_response.status = 200
            mock_response.text = AsyncMock(return_value=mock_html)
            mock_request.return_value = mock_response

            book = await crawler._get_book_details(123, "Test Book", "Fiction")

            assert book is not None
            assert book.id == 123
            assert book.title == "Test Book"
            assert book.category == "Fiction"

    def test_download_appropriateness_check(self, sample_gutenberg_book):
        """``_is_download_appropriate`` rejects excluded language, format, and copyright."""
        crawler = GutenbergCrawler()

        # Should be appropriate (public domain, allowed format)
        assert crawler._is_download_appropriate(sample_gutenberg_book) is True

        # Test with excluded language
        crawler.excluded_languages = ['en']
        assert crawler._is_download_appropriate(sample_gutenberg_book) is False

        # Test with disallowed format (language exclusion reset first so only
        # one rejection reason is active at a time)
        crawler.excluded_languages = []
        sample_gutenberg_book.file_format = 'pdf'
        assert crawler._is_download_appropriate(sample_gutenberg_book) is False

        # Test with non-public domain (format restored to an allowed one)
        sample_gutenberg_book.file_format = 'txt'
        sample_gutenberg_book.copyright_status = 'copyrighted'
        assert crawler._is_download_appropriate(sample_gutenberg_book) is False

    @pytest.mark.asyncio
    async def test_legal_validation(self, sample_gutenberg_book):
        """``validate_legal_status`` returns True for both copyright statuses tried."""
        crawler = GutenbergCrawler()

        # Public domain book should be valid
        is_valid = await crawler.validate_legal_status(sample_gutenberg_book)
        assert is_valid is True

        # Test with non-public domain
        sample_gutenberg_book.copyright_status = "copyrighted"
        is_valid = await crawler.validate_legal_status(sample_gutenberg_book)
        assert is_valid is True  # Still returns True for Gutenberg books

    def test_file_format_determination(self):
        """``_determine_file_format`` maps URL suffixes to formats, defaulting to txt."""
        crawler = GutenbergCrawler()

        test_cases = [
            ("http://example.com/book.txt", "txt"),
            ("http://example.com/book.html", "html"),
            ("http://example.com/book.epub", "epub"),
            ("http://example.com/book", "txt")  # Default
        ]

        for url, expected_format in test_cases:
            result = crawler._determine_file_format(url)
            # Name the failing URL so a broken case is identifiable from the
            # assertion output alone.
            assert result == expected_format, (
                f"{url!r}: expected {expected_format!r}, got {result!r}"
            )

    def test_download_statistics(self):
        """``get_download_statistics`` summarises crawled and failed books."""
        crawler = GutenbergCrawler()

        # Add some mock data
        book1 = GutenbergBook(1, "Book 1", "Author 1", "en", "Fiction",
                             "url1", "txt", "download1", quality_score=0.8)
        book2 = GutenbergBook(2, "Book 2", "Author 2", "fr", "Science",
                             "url2", "html", "download2", quality_score=0.9)

        crawler.crawled_books = {1: book1, 2: book2}
        crawler.failed_downloads = [3, 4]

        stats = crawler.get_download_statistics()

        assert stats['total_discovered'] == 2
        assert stats['failed_downloads'] == 2
        assert stats['success_rate'] == 0.5  # 2 success, 2 failures
        assert 'en' in stats['languages_discovered']
        assert 'fr' in stats['languages_discovered']
        assert 'Fiction' in stats['categories_discovered']
        assert 'Science' in stats['categories_discovered']

    @pytest.mark.asyncio
    async def test_book_recommendations(self):
        """``get_book_recommendations`` returns no more than ``limit`` items."""
        crawler = GutenbergCrawler()
        limit = 5

        with patch.object(crawler, '_discover_books_in_category') as mock_discover:
            async def mock_generator(category, languages):
                if category == "Science":
                    yield GutenbergBook(1, "Science Book", "Author", "en",
                                      "Science", "url", "txt", "download")

            mock_discover.return_value = mock_generator("Science", ["en"])

            recommendations = await crawler.get_book_recommendations(
                interests=['science'], limit=limit
            )

            # The result may be empty due to mocking, so a lower bound says
            # nothing (``len(...) >= 0`` can never fail). The requested limit
            # is the one bound that is meaningful here.
            assert len(recommendations) <= limit


class TestKnowledgeProcessor:
    """Tests for knowledge processing system.

    Most tests call the processor's private helpers directly; the tests that
    go through ``_process_content`` patch the model-backed helpers with mocks.
    """

    @pytest.mark.asyncio
    async def test_processor_initialization(self, device):
        """Constructor arguments are stored and ``nlp`` starts out unset."""
        processor = KnowledgeProcessor(
            device=device,
            chunk_size=256,
            chunk_overlap=25
        )

        assert processor.device == device
        assert processor.chunk_size == 256
        assert processor.chunk_overlap == 25
        assert processor.nlp is None  # Loaded lazily

    @pytest.mark.asyncio
    async def test_text_cleaning(self):
        """``_clean_text`` strips Gutenberg markers and collapses whitespace."""
        processor = KnowledgeProcessor()

        # Test text with common Gutenberg artifacts
        dirty_text = """
        *** START OF THE PROJECT GUTENBERG EBOOK TEST ***

        This is the actual content.
        It has    multiple    spaces.

        And multiple


        newlines.

        *** END OF THE PROJECT GUTENBERG EBOOK TEST ***
        """

        cleaned = await processor._clean_text(dirty_text)

        assert "*** START OF" not in cleaned
        assert "*** END OF" not in cleaned
        assert "multiple    spaces" not in cleaned
        assert cleaned.count('\n\n\n') == 0  # No triple newlines

    @pytest.mark.asyncio
    async def test_title_extraction(self):
        """``_extract_title`` uses the content first, then falls back to the filename."""
        processor = KnowledgeProcessor()

        # Test with content containing title
        content_with_title = """
        THE GREAT WORK

        Chapter 1

        This is the beginning of the story...
        """

        title = await processor._extract_title(content_with_title, "test_file.txt")
        assert "GREAT WORK" in title

        # Test with filename fallback
        title = await processor._extract_title("No clear title here", "12345_the_book_title.txt")
        assert "Book Title" in title

    def test_chunk_type_determination(self):
        """``_determine_chunk_type`` labels each sample text with its chunk type."""
        processor = KnowledgeProcessor()

        test_cases = [
            ("Short text", "short_paragraph"),
            ("Chapter 1: Introduction", "section_header"),
            ("This is a normal paragraph with sufficient length to be classified properly.", "paragraph"),
            ("List of items:", "list_header")
        ]

        for text, expected_type in test_cases:
            result = processor._determine_chunk_type(text)
            # Name the failing sample so a broken case is identifiable from
            # the assertion output alone.
            assert result == expected_type, (
                f"{text!r}: expected {expected_type!r}, got {result!r}"
            )

    @pytest.mark.asyncio
    async def test_quality_score_calculation(self):
        """``_calculate_quality_score`` ranks scholarly text above gossip."""
        processor = KnowledgeProcessor()

        # High quality content
        high_quality = """
        This is a well-researched scientific study that presents important
        findings based on rigorous analysis. The research methodology was
        peer-reviewed and published in an academic journal. The results
        show significant evidence for the hypothesis tested.
        """ * 10  # Make it longer

        quality = await processor._calculate_quality_score(high_quality, "Scientific Research Study")
        assert quality > 0.5

        # Lower quality content
        low_quality = "unverified rumor gossip speculation fake news"

        quality = await processor._calculate_quality_score(low_quality, "Gossip")
        assert quality < 0.5

    @pytest.mark.asyncio
    async def test_category_classification(self):
        """``_classify_content`` assigns the expected top-level category."""
        processor = KnowledgeProcessor()

        # Science content
        science_content = """
        This research examines the quantum mechanics of particle physics.
        The experiment was conducted using advanced scientific methods
        to test the hypothesis about atomic behavior.
        """

        # Only the category is asserted; the subcategory is unpacked but unchecked.
        category, subcategory = await processor._classify_content(
            science_content, "Quantum Physics Research"
        )
        assert category == "science"

        # History content
        history_content = """
        The ancient Roman Empire was a vast civilization that
        dominated the Mediterranean world for centuries. The empire's
        military conquests and cultural achievements shaped history.
        """

        category, subcategory = await processor._classify_content(
            history_content, "Roman Empire History"
        )
        assert category == "history"

    @pytest.mark.asyncio
    async def test_complexity_score_calculation(self):
        """``_calculate_complexity_score`` stays within [0.0, 1.0] for both samples."""
        processor = KnowledgeProcessor()

        # Simple text
        simple_text = "This is easy to read. The words are simple. Anyone can understand this."
        complexity = await processor._calculate_complexity_score(simple_text)
        assert 0.0 <= complexity <= 1.0

        # Complex text
        complex_text = """
        The epistemological ramifications of phenomenological investigations
        require sophisticated methodological approaches to hermeneutical analysis.
        """
        complexity = await processor._calculate_complexity_score(complex_text)
        assert 0.0 <= complexity <= 1.0

    def test_processing_statistics(self):
        """``get_processing_statistics`` reports all required keys."""
        processor = KnowledgeProcessor()

        stats = processor.get_processing_statistics()

        required_keys = [
            'models_loaded', 'chunk_size', 'chunk_overlap',
            'supported_categories', 'device'
        ]

        for key in required_keys:
            assert key in stats

        assert isinstance(stats['supported_categories'], list)
        assert len(stats['supported_categories']) > 0

    @pytest.mark.asyncio
    async def test_processed_knowledge_creation(self, sample_book_content):
        """``_process_content`` assembles a ProcessedKnowledge from helper output."""
        processor = KnowledgeProcessor()

        # Mock the heavy NLP models for testing
        with patch.object(processor, '_generate_summary') as mock_summary, \
             patch.object(processor, '_extract_concepts') as mock_concepts, \
             patch.object(processor, '_extract_keywords') as mock_keywords, \
             patch.object(processor, '_classify_content') as mock_classify, \
             patch.object(processor, '_generate_embedding') as mock_embedding:

            mock_summary.return_value = "Test summary"
            mock_concepts.return_value = ["science", "method", "hypothesis"]
            mock_keywords.return_value = ["scientific", "research", "study"]
            mock_classify.return_value = ("science", "methodology")
            mock_embedding.return_value = None

            result = await processor._process_content(
                title="The Art of Science",
                content=sample_book_content,
                source_metadata={'source': 'test'}
            )

            assert isinstance(result, ProcessedKnowledge)
            assert result.title == "The Art of Science"
            assert result.category == "science"
            assert result.subcategory == "methodology"
            assert len(result.keywords) > 0
            assert len(result.concepts) > 0

    @pytest.mark.asyncio
    async def test_web_content_processing(self):
        """``process_web_content`` passes cleaned text on to ``_process_content``."""
        processor = KnowledgeProcessor()

        html_content = """
        <html>
            <head><title>Test Article</title></head>
            <body>
                <nav>Navigation menu</nav>
                <article>
                    <h1>Main Content</h1>
                    <p>This is the main content of the article.</p>
                </article>
                <footer>Footer content</footer>
                <script>alert('test');</script>
            </body>
        </html>
        """

        with patch.object(processor, '_process_content') as mock_process:
            mock_process.return_value = Mock(spec=ProcessedKnowledge)

            await processor.process_web_content(html_content, url="http://test.com")

            # Should have called _process_content with cleaned text
            mock_process.assert_called_once()
            args, kwargs = mock_process.call_args

            # Accept the text whether it was passed as the second positional
            # argument or as the ``content`` keyword, so the test does not
            # depend on the caller's call style.
            processed_text = args[1] if len(args) > 1 else kwargs['content']

            # Should not contain script or nav content
            assert "alert('test')" not in processed_text
            assert "Navigation menu" not in processed_text
            assert "Main Content" in processed_text


class TestProcessedKnowledge:
    """Checks for the ProcessedKnowledge data container."""

    def test_processed_knowledge_structure(self):
        """Constructor values can be read back from the instance."""
        constructor_fields = {
            "title": "Test Knowledge",
            "content": "Test content",
            "summary": "Test summary",
            "category": "science",
            "subcategory": "physics",
            "keywords": ["test", "science"],
            "concepts": ["quantum", "mechanics"],
            "quality_score": 0.8,
            "complexity_score": 0.6,
            "embedding": None,
            "chunks": [],
            "metadata": {"source": "test"},
        }
        record = ProcessedKnowledge(**constructor_fields)

        # Scalar fields come back unchanged.
        assert record.title == "Test Knowledge"
        assert record.category == "science"
        assert record.quality_score == 0.8

        # List fields keep both of their entries.
        assert len(record.keywords) == 2
        assert len(record.concepts) == 2