""" Knowledge processor for extracting, cleaning, and structuring knowledge from various text sources for Lyra's learning. """ import asyncio import logging import re import nltk import spacy from typing import Dict, List, Optional, Tuple, Set, Any from dataclasses import dataclass from pathlib import Path import torch import torch.nn as nn from sentence_transformers import SentenceTransformer from transformers import pipeline import numpy as np from collections import Counter import textstat from bs4 import BeautifulSoup import json logger = logging.getLogger(__name__) @dataclass class ProcessedKnowledge: """Represents processed knowledge ready for storage.""" title: str content: str summary: str category: str subcategory: Optional[str] keywords: List[str] concepts: List[str] quality_score: float complexity_score: float embedding: Optional[np.ndarray] chunks: List[Dict[str, Any]] metadata: Dict[str, Any] @dataclass class TextChunk: """Represents a chunk of text with metadata.""" content: str start_pos: int end_pos: int chunk_type: str # 'paragraph', 'section', 'chapter' importance_score: float concepts: List[str] embedding: Optional[np.ndarray] = None class KnowledgeProcessor: """ Advanced knowledge processor that extracts meaningful information from text sources and prepares it for Lyra's learning. """ def __init__( self, device: Optional[torch.device] = None, embedding_model: str = "all-MiniLM-L6-v2", chunk_size: int = 512, chunk_overlap: int = 50 ): self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap # NLP models self.nlp = None # Will be loaded lazily self.embedding_model = None self.summarizer = None self.classifier = None # Text processing patterns self.sentence_splitter = re.compile(r'(?<=[.!?])\s+') self.paragraph_splitter = re.compile(r'\n\s*\n') # Knowledge categories and their keywords self.category_keywords = { 'science': [ 'research', 'experiment', 'theory', 'hypothesis', 'data', 'analysis', 'method', 'scientific', 'study', 'physics', 'chemistry', 'biology', 'mathematics', 'astronomy' ], 'history': [ 'century', 'ancient', 'civilization', 'empire', 'war', 'revolution', 'culture', 'society', 'historical', 'period', 'medieval', 'renaissance', 'industrial', 'modern' ], 'philosophy': [ 'ethics', 'morality', 'existence', 'reality', 'consciousness', 'logic', 'reason', 'truth', 'knowledge', 'metaphysics', 'epistemology', 'philosopher', 'philosophical', 'wisdom' ], 'literature': [ 'character', 'plot', 'theme', 'narrative', 'poetry', 'novel', 'story', 'drama', 'author', 'literary', 'fiction', 'metaphor', 'symbolism', 'prose' ], 'art': [ 'painting', 'sculpture', 'artist', 'creative', 'aesthetic', 'beauty', 'design', 'color', 'form', 'style', 'movement', 'gallery', 'museum', 'artistic' ], 'technology': [ 'computer', 'software', 'programming', 'digital', 'internet', 'algorithm', 'innovation', 'engineering', 'technical', 'machine', 'automation', 'electronics', 'invention' ] } # Quality indicators self.quality_indicators = { 'positive': [ 'evidence', 'research', 'study', 'analysis', 'peer-reviewed', 'academic', 'scholarly', 'university', 'institute', 'journal' ], 'negative': [ 'unverified', 'rumor', 'gossip', 'speculation', 'opinion', 'conspiracy', 'myth', 'fake', 'false', 'misleading' ] } async def initialize(self): """Initialize NLP models and resources.""" logger.info("Initializing knowledge processor...") # Download required NLTK data try: nltk.download('punkt', quiet=True) nltk.download('stopwords', quiet=True) nltk.download('wordnet', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) except Exception as e: logger.warning(f"Failed to download some NLTK data: {e}") # Load spaCy model try: self.nlp = spacy.load("en_core_web_sm") except OSError: logger.warning("spaCy model not found, downloading...") spacy.cli.download("en_core_web_sm") self.nlp = spacy.load("en_core_web_sm") # Load embedding model self.embedding_model = SentenceTransformer( "sentence-transformers/all-MiniLM-L6-v2", device=self.device ) # Load summarization model self.summarizer = pipeline( "summarization", model="facebook/bart-large-cnn", device=0 if self.device.type == "cuda" else -1 ) # Load text classification model self.classifier = pipeline( "zero-shot-classification", model="facebook/bart-large-mnli", device=0 if self.device.type == "cuda" else -1 ) logger.info("Knowledge processor initialized successfully") async def process_text_file( self, file_path: Path, title: Optional[str] = None, source_metadata: Optional[Dict[str, Any]] = None ) -> ProcessedKnowledge: """ Process a text file and extract structured knowledge. Args: file_path: Path to the text file title: Optional title (will be extracted if not provided) source_metadata: Additional metadata about the source Returns: ProcessedKnowledge object """ logger.info(f"Processing text file: {file_path}") # Read file content try: with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: raw_content = f.read() except Exception as e: logger.error(f"Failed to read file {file_path}: {e}") raise # Detect and clean text format cleaned_content = await self._clean_text(raw_content) # Extract title if not provided if not title: title = await self._extract_title(cleaned_content, file_path.name) # Process the content return await self._process_content( title=title, content=cleaned_content, source_metadata=source_metadata or {} ) async def process_web_content( self, html_content: str, title: Optional[str] = None, url: Optional[str] = None ) -> ProcessedKnowledge: """Process HTML content from web sources.""" # Extract text from HTML soup = BeautifulSoup(html_content, 'html.parser') # Remove unwanted elements for element in soup(['script', 'style', 'nav', 'footer', 'aside']): element.decompose() # Extract title if not title: title_elem = soup.find('title') title = title_elem.get_text(strip=True) if title_elem else "Web Content" # Extract main content main_content = soup.get_text(separator='\n', strip=True) cleaned_content = await self._clean_text(main_content) source_metadata = {'source_type': 'web', 'url': url} return await self._process_content(title, cleaned_content, source_metadata) async def _process_content( self, title: str, content: str, source_metadata: Dict[str, Any] ) -> ProcessedKnowledge: """Core content processing logic.""" # Analyze content structure chunks = await self._chunk_text(content) # Extract concepts and keywords concepts = await self._extract_concepts(content) keywords = await self._extract_keywords(content) # Classify content category, subcategory = await self._classify_content(content, title) # Calculate quality scores quality_score = await self._calculate_quality_score(content, title) complexity_score = await self._calculate_complexity_score(content) # Generate summary summary = await self._generate_summary(content) # Generate embeddings content_embedding = await self._generate_embedding(content) # Process chunks with embeddings processed_chunks = [] for chunk in chunks: chunk_embedding = await self._generate_embedding(chunk.content) chunk_dict = { 'content': chunk.content, 'start_pos': chunk.start_pos, 'end_pos': chunk.end_pos, 'chunk_type': chunk.chunk_type, 'importance_score': chunk.importance_score, 'concepts': chunk.concepts, 'embedding': chunk_embedding.tolist() if chunk_embedding is not None else None } processed_chunks.append(chunk_dict) # Prepare metadata metadata = { **source_metadata, 'processing_timestamp': str(asyncio.get_event_loop().time()), 'word_count': len(content.split()), 'sentence_count': len(self.sentence_splitter.split(content)), 'paragraph_count': len(self.paragraph_splitter.split(content)), 'readability_score': textstat.flesch_reading_ease(content), 'language': 'en' # Could be detected } return ProcessedKnowledge( title=title, content=content, summary=summary, category=category, subcategory=subcategory, keywords=keywords, concepts=concepts, quality_score=quality_score, complexity_score=complexity_score, embedding=content_embedding, chunks=processed_chunks, metadata=metadata ) async def _clean_text(self, raw_content: str) -> str: """Clean and normalize text content.""" # Remove excessive whitespace content = re.sub(r'\n\s*\n\s*\n', '\n\n', raw_content) content = re.sub(r'[ \t]+', ' ', content) # Remove common Gutenberg headers/footers content = re.sub( r'\*\*\*\s*START OF .*?\*\*\*.*?\n', '', content, flags=re.DOTALL | re.IGNORECASE ) content = re.sub( r'\*\*\*\s*END OF .*?\*\*\*.*', '', content, flags=re.DOTALL | re.IGNORECASE ) # Remove page numbers and chapter markers that might interfere content = re.sub(r'\n\s*\d+\s*\n', '\n', content) content = re.sub(r'\n\s*Page \d+\s*\n', '\n', content, flags=re.IGNORECASE) # Normalize quotes and dashes content = content.replace('"', '"').replace('"', '"') content = content.replace(''', "'").replace(''', "'") content = content.replace('—', '--').replace('–', '-') return content.strip() async def _extract_title(self, content: str, filename: str) -> str: """Extract title from content or filename.""" lines = content.split('\n')[:10] # Check first 10 lines # Look for title patterns for line in lines: line = line.strip() if len(line) > 10 and len(line) < 100: # Check if line looks like a title if line.isupper() or line.istitle(): return line # Extract from filename as fallback title = filename.replace('_', ' ').replace('-', ' ') title = re.sub(r'\.[^.]+$', '', title) # Remove extension title = re.sub(r'^\d+_?', '', title) # Remove leading numbers return title.title() async def _chunk_text(self, content: str) -> List[TextChunk]: """Split text into meaningful chunks.""" chunks = [] paragraphs = self.paragraph_splitter.split(content) current_pos = 0 for paragraph in paragraphs: if len(paragraph.strip()) < 50: # Skip very short paragraphs current_pos += len(paragraph) + 2 # +2 for newlines continue # Determine chunk type chunk_type = self._determine_chunk_type(paragraph) # Calculate importance score importance_score = await self._calculate_chunk_importance(paragraph) # Extract concepts from chunk chunk_concepts = await self._extract_chunk_concepts(paragraph) chunk = TextChunk( content=paragraph.strip(), start_pos=current_pos, end_pos=current_pos + len(paragraph), chunk_type=chunk_type, importance_score=importance_score, concepts=chunk_concepts ) chunks.append(chunk) current_pos += len(paragraph) + 2 return chunks def _determine_chunk_type(self, paragraph: str) -> str: """Determine the type of text chunk.""" if len(paragraph) < 100: return 'short_paragraph' elif any(keyword in paragraph.lower() for keyword in ['chapter', 'section', 'part']): return 'section_header' elif paragraph.strip().endswith(':'): return 'list_header' else: return 'paragraph' async def _calculate_chunk_importance(self, chunk: str) -> float: """Calculate importance score for a text chunk.""" score = 0.5 # Base score # Length factor (not too short, not too long) length = len(chunk.split()) if 50 <= length <= 200: score += 0.1 elif length < 20: score -= 0.2 # Keyword density important_words = [ 'important', 'significant', 'crucial', 'essential', 'key', 'fundamental', 'principle', 'concept', 'theory', 'discovery' ] keyword_count = sum(1 for word in important_words if word in chunk.lower()) score += min(0.3, keyword_count * 0.1) # Question presence (often indicates important information) question_count = chunk.count('?') score += min(0.2, question_count * 0.05) # Technical terms (using simple heuristic) doc = self.nlp(chunk[:1000]) # Limit for performance technical_terms = [ token for token in doc if token.pos_ in ['NOUN', 'PROPN'] and len(token.text) > 6 ] score += min(0.2, len(technical_terms) * 0.01) return min(1.0, max(0.0, score)) async def _extract_concepts(self, content: str) -> List[str]: """Extract key concepts from content.""" doc = self.nlp(content[:5000]) # Limit for performance # Extract noun phrases as concepts concepts = [] for chunk in doc.noun_chunks: if len(chunk.text) > 3 and len(chunk.text.split()) <= 3: concepts.append(chunk.text.lower()) # Extract named entities for ent in doc.ents: if ent.label_ in ['PERSON', 'ORG', 'GPE', 'EVENT', 'WORK_OF_ART']: concepts.append(ent.text.lower()) # Remove duplicates and return top concepts concept_counts = Counter(concepts) return [concept for concept, count in concept_counts.most_common(20)] async def _extract_chunk_concepts(self, chunk: str) -> List[str]: """Extract concepts from a specific chunk.""" doc = self.nlp(chunk[:1000]) # Limit for performance concepts = [] for chunk_span in doc.noun_chunks: if len(chunk_span.text) > 3: concepts.append(chunk_span.text.lower()) for ent in doc.ents: concepts.append(ent.text.lower()) return list(set(concepts))[:10] # Return unique concepts, limited async def _extract_keywords(self, content: str) -> List[str]: """Extract keywords from content.""" doc = self.nlp(content[:5000]) # Limit for performance # Extract meaningful words keywords = [] for token in doc: if (token.pos_ in ['NOUN', 'ADJ', 'VERB'] and not token.is_stop and not token.is_punct and len(token.text) > 3): keywords.append(token.lemma_.lower()) # Count frequency and return top keywords keyword_counts = Counter(keywords) return [word for word, count in keyword_counts.most_common(15)] async def _classify_content(self, content: str, title: str) -> Tuple[str, Optional[str]]: """Classify content into categories.""" # Combine title and first part of content for classification classification_text = f"{title}. {content[:1000]}" # Use keyword-based classification first (faster) category_scores = {} for category, keywords in self.category_keywords.items(): score = sum(1 for keyword in keywords if keyword in classification_text.lower()) category_scores[category] = score if category_scores and max(category_scores.values()) > 0: category = max(category_scores, key=category_scores.get) else: # Fallback to ML classification categories = list(self.category_keywords.keys()) try: result = self.classifier(classification_text, categories) category = result['labels'][0] except Exception as e: logger.warning(f"Classification failed: {e}") category = 'general' # Determine subcategory based on more specific analysis subcategory = await self._determine_subcategory(content, category) return category, subcategory async def _determine_subcategory(self, content: str, category: str) -> Optional[str]: """Determine subcategory based on content analysis.""" subcategory_mapping = { 'science': { 'physics': ['physics', 'quantum', 'relativity', 'mechanics'], 'biology': ['biology', 'evolution', 'genetics', 'species'], 'chemistry': ['chemistry', 'chemical', 'molecule', 'reaction'], 'astronomy': ['astronomy', 'space', 'universe', 'planet', 'star'] }, 'history': { 'ancient': ['ancient', 'rome', 'greece', 'egypt', 'civilization'], 'medieval': ['medieval', 'middle ages', 'feudal', 'knight'], 'modern': ['modern', 'industrial', 'revolution', 'war', 'century'] }, 'literature': { 'fiction': ['novel', 'story', 'character', 'plot'], 'poetry': ['poem', 'verse', 'rhyme', 'stanza'], 'drama': ['play', 'theater', 'act', 'scene'] } } if category in subcategory_mapping: content_lower = content[:2000].lower() subcategory_scores = {} for subcategory, keywords in subcategory_mapping[category].items(): score = sum(1 for keyword in keywords if keyword in content_lower) subcategory_scores[subcategory] = score if subcategory_scores and max(subcategory_scores.values()) > 0: return max(subcategory_scores, key=subcategory_scores.get) return None async def _calculate_quality_score(self, content: str, title: str) -> float: """Calculate quality score for content.""" score = 0.5 # Base score # Content length (optimal range) word_count = len(content.split()) if 500 <= word_count <= 10000: score += 0.1 elif word_count < 100: score -= 0.2 # Readability try: readability = textstat.flesch_reading_ease(content) if 30 <= readability <= 70: # Reasonable complexity score += 0.1 except: pass # Quality indicators content_lower = content.lower() positive_indicators = sum( 1 for indicator in self.quality_indicators['positive'] if indicator in content_lower ) negative_indicators = sum( 1 for indicator in self.quality_indicators['negative'] if indicator in content_lower ) score += min(0.2, positive_indicators * 0.05) score -= min(0.3, negative_indicators * 0.1) # Title quality if len(title.split()) >= 3 and not title.isupper(): score += 0.05 return min(1.0, max(0.0, score)) async def _calculate_complexity_score(self, content: str) -> float: """Calculate complexity score for content.""" try: # Use various readability metrics flesch_score = textstat.flesch_reading_ease(content) flesch_kincaid = textstat.flesch_kincaid_grade(content) # Normalize to 0-1 scale complexity = 1.0 - (flesch_score / 100.0) complexity = max(0.0, min(1.0, complexity)) return complexity except: return 0.5 # Default complexity async def _generate_summary(self, content: str) -> str: """Generate summary of content.""" try: # Limit content length for summarization max_length = 1024 if len(content) > max_length: # Take first part of content content_to_summarize = content[:max_length] else: content_to_summarize = content # Generate summary summary_result = self.summarizer( content_to_summarize, max_length=150, min_length=50, do_sample=False ) return summary_result[0]['summary_text'] except Exception as e: logger.warning(f"Summarization failed: {e}") # Fallback: return first few sentences sentences = self.sentence_splitter.split(content)[:3] return ' '.join(sentences) async def _generate_embedding(self, text: str) -> Optional[np.ndarray]: """Generate embedding for text.""" try: # Limit text length if len(text) > 500: text = text[:500] embedding = self.embedding_model.encode(text, convert_to_numpy=True) return embedding except Exception as e: logger.warning(f"Embedding generation failed: {e}") return None def get_processing_statistics(self) -> Dict[str, Any]: """Get statistics about processed knowledge.""" return { 'models_loaded': { 'nlp': self.nlp is not None, 'embedding_model': self.embedding_model is not None, 'summarizer': self.summarizer is not None, 'classifier': self.classifier is not None }, 'chunk_size': self.chunk_size, 'chunk_overlap': self.chunk_overlap, 'supported_categories': list(self.category_keywords.keys()), 'device': str(self.device) }