Download TXT
"""
with patch.object(crawler, '_rate_limited_request') as mock_request:
mock_response = Mock()
mock_response.status = 200
mock_response.text = AsyncMock(return_value=mock_html)
mock_request.return_value = mock_response
book = await crawler._get_book_details(123, "Test Book", "Fiction")
assert book is not None
assert book.id == 123
assert book.title == "Test Book"
assert book.category == "Fiction"
def test_download_appropriateness_check(self, sample_gutenberg_book):
    """Exercise the crawler's download-eligibility check on one book fixture.

    The fixture is mutated step by step, so the order of the scenarios
    matters: each one changes a single attribute relative to the
    previous state.
    """
    book = sample_gutenberg_book
    gutenberg = GutenbergCrawler()

    # Baseline: the untouched fixture is expected to be accepted.
    assert gutenberg._is_download_appropriate(book) is True

    # Excluding 'en' is expected to make the same book ineligible.
    gutenberg.excluded_languages = ['en']
    assert gutenberg._is_download_appropriate(book) is False

    # Language exclusion lifted, but the format is switched to 'pdf'.
    gutenberg.excluded_languages = []
    book.file_format = 'pdf'
    assert gutenberg._is_download_appropriate(book) is False

    # Format restored to 'txt'; copyright status set to 'copyrighted'.
    book.file_format = 'txt'
    book.copyright_status = 'copyrighted'
    assert gutenberg._is_download_appropriate(book) is False
@pytest.mark.asyncio
async def test_legal_validation(self, sample_gutenberg_book):
    """Check validate_legal_status before and after changing the copyright status."""
    book = sample_gutenberg_book
    gutenberg = GutenbergCrawler()

    # The fixture as provided is expected to validate.
    assert await gutenberg.validate_legal_status(book) is True

    # The test expects validation to keep succeeding for Gutenberg books
    # even once the status is flipped to "copyrighted".
    book.copyright_status = "copyrighted"
    assert await gutenberg.validate_legal_status(book) is True
def test_file_format_determination(self):
    """Check the file format the crawler derives from a download URL."""
    expected_by_url = {
        "http://example.com/book.txt": "txt",
        "http://example.com/book.html": "html",
        "http://example.com/book.epub": "epub",
        # No extension: the expected default is "txt".
        "http://example.com/book": "txt",
    }
    gutenberg = GutenbergCrawler()
    for url, fmt in expected_by_url.items():
        assert gutenberg._determine_file_format(url) == fmt
def test_download_statistics(self):
    """Check the summary returned by get_download_statistics.

    Two books are registered as crawled and two ids as failed, then the
    counts, the success rate and the discovered languages/categories
    are compared against that setup.
    """
    gutenberg = GutenbergCrawler()

    # Seed the crawler state directly instead of running a crawl.
    first = GutenbergBook(1, "Book 1", "Author 1", "en", "Fiction",
                          "url1", "txt", "download1", quality_score=0.8)
    second = GutenbergBook(2, "Book 2", "Author 2", "fr", "Science",
                           "url2", "html", "download2", quality_score=0.9)
    gutenberg.crawled_books = {1: first, 2: second}
    gutenberg.failed_downloads = [3, 4]

    summary = gutenberg.get_download_statistics()

    assert summary['total_discovered'] == 2
    assert summary['failed_downloads'] == 2
    assert summary['success_rate'] == 0.5  # 2 success, 2 failures
    for language in ('en', 'fr'):
        assert language in summary['languages_discovered']
    for category in ('Fiction', 'Science'):
        assert category in summary['categories_discovered']
@pytest.mark.asyncio
async def test_book_recommendations(self):
    """Test book recommendation generation with category discovery mocked.

    ``_discover_books_in_category`` is patched on the crawler instance so
    that it hands back a pre-built async generator yielding one "Science"
    book; no real discovery is performed.
    """
    crawler = GutenbergCrawler()

    async def fake_discovery(category, languages):
        # Only the "Science" category produces a book.
        if category == "Science":
            yield GutenbergBook(1, "Science Book", "Author", "en",
                                "Science", "url", "txt", "download")

    with patch.object(crawler, '_discover_books_in_category') as mock_discover:
        mock_discover.return_value = fake_discovery("Science", ["en"])
        recommendations = await crawler.get_book_recommendations(
            interests=['science'], limit=5
        )

    # The previous check, ``len(recommendations) >= 0``, could never fail.
    # The result may legitimately be empty because discovery is mocked, so
    # the only bound this test can state is the one implied by ``limit=5``.
    # NOTE(review): assumes ``limit`` caps the number of results - confirm
    # against get_book_recommendations.
    assert len(recommendations) <= 5
class TestKnowledgeProcessor:
    """Tests for knowledge processing system."""

    @pytest.mark.asyncio
    async def test_processor_initialization(self, device):
        """Test that constructor arguments are stored on the processor."""
        processor = KnowledgeProcessor(
            device=device,
            chunk_size=256,
            chunk_overlap=25,
        )

        assert processor.device == device
        assert processor.chunk_size == 256
        assert processor.chunk_overlap == 25
        assert processor.nlp is None  # Loaded lazily

    @pytest.mark.asyncio
    async def test_text_cleaning(self):
        """Test text cleaning functionality.

        The input contains Gutenberg START/END marker lines, a run of
        several spaces and a run of blank lines, so that every assertion
        below checks something actually present in the input.
        """
        processor = KnowledgeProcessor()

        # Text with common Gutenberg artifacts. The whitespace runs are
        # spelled out explicitly so they cannot be collapsed by an editor
        # or a copy/paste step.
        dirty_text = (
            "\n"
            "*** START OF THE PROJECT GUTENBERG EBOOK TEST ***\n"
            "This is the actual content.\n"
            "It has    multiple    spaces.\n"
            "And multiple\n"
            "\n"
            "\n"
            "\n"
            "newlines.\n"
            "*** END OF THE PROJECT GUTENBERG EBOOK TEST ***\n"
        )

        cleaned = await processor._clean_text(dirty_text)

        assert "*** START OF" not in cleaned
        assert "*** END OF" not in cleaned
        # The un-collapsed run of spaces from the input must be gone.
        assert "multiple    spaces" not in cleaned
        assert cleaned.count('\n\n\n') == 0  # No triple newlines

    @pytest.mark.asyncio
    async def test_title_extraction(self):
        """Test title extraction from content and filename."""
        processor = KnowledgeProcessor()

        # Content whose first non-empty line is an upper-case title.
        content_with_title = (
            "\n"
            "THE GREAT WORK\n"
            "Chapter 1\n"
            "This is the beginning of the story...\n"
        )
        title = await processor._extract_title(content_with_title, "test_file.txt")
        assert "GREAT WORK" in title

        # Content without a usable title: the filename is the fallback.
        title = await processor._extract_title(
            "No clear title here", "12345_the_book_title.txt"
        )
        assert "Book Title" in title

    def test_chunk_type_determination(self):
        """Test text chunk type determination."""
        processor = KnowledgeProcessor()

        test_cases = [
            ("Short text", "short_paragraph"),
            ("Chapter 1: Introduction", "section_header"),
            ("This is a normal paragraph with sufficient length to be classified properly.", "paragraph"),
            ("List of items:", "list_header"),
        ]

        for text, expected_type in test_cases:
            result = processor._determine_chunk_type(text)
            assert result == expected_type

    @pytest.mark.asyncio
    async def test_quality_score_calculation(self):
        """Test content quality score calculation.

        Scores are compared against a 0.5 threshold: scholarly-sounding
        text is expected above it, gossip-style text below it.
        """
        processor = KnowledgeProcessor()

        # High quality content
        high_quality = (
            "\n"
            "This is a well-researched scientific study that presents important\n"
            "findings based on rigorous analysis. The research methodology was\n"
            "peer-reviewed and published in an academic journal. The results\n"
            "show significant evidence for the hypothesis tested.\n"
        ) * 10  # Make it longer
        quality = await processor._calculate_quality_score(
            high_quality, "Scientific Research Study"
        )
        assert quality > 0.5

        # Lower quality content
        low_quality = "unverified rumor gossip speculation fake news"
        quality = await processor._calculate_quality_score(low_quality, "Gossip")
        assert quality < 0.5

    @pytest.mark.asyncio
    async def test_category_classification(self):
        """Test content category classification.

        Only the top-level category is asserted; the subcategory returned
        alongside it is not checked here.
        """
        processor = KnowledgeProcessor()

        # Science content
        science_content = (
            "\n"
            "This research examines the quantum mechanics of particle physics.\n"
            "The experiment was conducted using advanced scientific methods\n"
            "to test the hypothesis about atomic behavior.\n"
        )
        category, subcategory = await processor._classify_content(
            science_content, "Quantum Physics Research"
        )
        assert category == "science"

        # History content
        history_content = (
            "\n"
            "The ancient Roman Empire was a vast civilization that\n"
            "dominated the Mediterranean world for centuries. The empire's\n"
            "military conquests and cultural achievements shaped history.\n"
        )
        category, subcategory = await processor._classify_content(
            history_content, "Roman Empire History"
        )
        assert category == "history"

    @pytest.mark.asyncio
    async def test_complexity_score_calculation(self):
        """Test that complexity scores stay within the closed range [0, 1].

        Both a simple and a complex sample are scored; only the range is
        asserted, not the ordering of the two scores.
        """
        processor = KnowledgeProcessor()

        # Simple text
        simple_text = "This is easy to read. The words are simple. Anyone can understand this."
        complexity = await processor._calculate_complexity_score(simple_text)
        assert 0.0 <= complexity <= 1.0

        # Complex text
        complex_text = (
            "\n"
            "The epistemological ramifications of phenomenological investigations\n"
            "require sophisticated methodological approaches to hermeneutical analysis.\n"
        )
        complexity = await processor._calculate_complexity_score(complex_text)
        assert 0.0 <= complexity <= 1.0

    def test_processing_statistics(self):
        """Test processing statistics generation."""
        processor = KnowledgeProcessor()

        stats = processor.get_processing_statistics()

        required_keys = [
            'models_loaded', 'chunk_size', 'chunk_overlap',
            'supported_categories', 'device',
        ]
        for key in required_keys:
            assert key in stats

        assert isinstance(stats['supported_categories'], list)
        assert len(stats['supported_categories']) > 0

    @pytest.mark.asyncio
    async def test_processed_knowledge_creation(self, sample_book_content):
        """Test creation of ProcessedKnowledge object.

        The summary, concept, keyword, classification and embedding steps
        are patched with canned return values, so the test only checks
        that ``_process_content`` assembles them into the result.
        """
        processor = KnowledgeProcessor()

        # Mock the heavy NLP models for testing
        with patch.object(processor, '_generate_summary') as mock_summary, \
                patch.object(processor, '_extract_concepts') as mock_concepts, \
                patch.object(processor, '_extract_keywords') as mock_keywords, \
                patch.object(processor, '_classify_content') as mock_classify, \
                patch.object(processor, '_generate_embedding') as mock_embedding:
            mock_summary.return_value = "Test summary"
            mock_concepts.return_value = ["science", "method", "hypothesis"]
            mock_keywords.return_value = ["scientific", "research", "study"]
            mock_classify.return_value = ("science", "methodology")
            mock_embedding.return_value = None

            result = await processor._process_content(
                title="The Art of Science",
                content=sample_book_content,
                source_metadata={'source': 'test'}
            )

        assert isinstance(result, ProcessedKnowledge)
        assert result.title == "The Art of Science"
        assert result.category == "science"
        assert result.subcategory == "methodology"
        assert len(result.keywords) > 0
        assert len(result.concepts) > 0

    @pytest.mark.asyncio
    async def test_web_content_processing(self):
        """Test processing of web HTML content.

        ``_process_content`` is patched so the text handed to it can be
        inspected. The HTML input contains a ``<script>`` and a ``<nav>``
        element; without them the negative assertions below would pass
        for any input.
        """
        processor = KnowledgeProcessor()

        html_content = (
            "\n"
            "<html>\n"
            "<head><title>Test Article</title></head>\n"
            "<body>\n"
            "<script>alert('test');</script>\n"
            "<nav>Navigation menu</nav>\n"
            "<main>\n"
            "<h1>Main Content</h1>\n"
            "<p>This is the main content of the article.</p>\n"
            "</main>\n"
            "</body>\n"
            "</html>\n"
        )

        with patch.object(processor, '_process_content') as mock_process:
            mock_process.return_value = Mock(spec=ProcessedKnowledge)

            await processor.process_web_content(html_content, url="http://test.com")

            # Should have called _process_content with cleaned text
            mock_process.assert_called_once()
            args, kwargs = mock_process.call_args

        # Accept the content whether it was passed by keyword or as the
        # second positional argument.
        cleaned_text = kwargs["content"] if "content" in kwargs else args[1]

        # Should not contain script or nav content
        assert "alert('test')" not in cleaned_text
        assert "Navigation menu" not in cleaned_text
        assert "Main Content" in cleaned_text
class TestProcessedKnowledge:
    """Tests for ProcessedKnowledge data structure."""

    def test_processed_knowledge_structure(self):
        """Build a ProcessedKnowledge and read back a selection of its fields."""
        field_values = dict(
            title="Test Knowledge",
            content="Test content",
            summary="Test summary",
            category="science",
            subcategory="physics",
            keywords=["test", "science"],
            concepts=["quantum", "mechanics"],
            quality_score=0.8,
            complexity_score=0.6,
            embedding=None,
            chunks=[],
            metadata={"source": "test"},
        )
        record = ProcessedKnowledge(**field_values)

        # Scalar fields come back exactly as supplied.
        assert record.title == "Test Knowledge"
        assert record.category == "science"
        assert record.quality_score == 0.8

        # List fields keep both of their entries.
        assert len(record.keywords) == 2
        assert len(record.concepts) == 2