Lyra/lyra/knowledge/knowledge_processor.py

"""
Knowledge processor for extracting, cleaning, and structuring knowledge
from various text sources for Lyra's learning.
"""

import asyncio
import logging
import re
import nltk
import spacy
from typing import Dict, List, Optional, Tuple, Set, Any
from dataclasses import dataclass
from pathlib import Path
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import numpy as np
from collections import Counter
import textstat
from bs4 import BeautifulSoup
import json

logger = logging.getLogger(__name__)


@dataclass
class ProcessedKnowledge:
    """Represents processed knowledge ready for storage."""
    title: str
    content: str
    summary: str
    category: str
    subcategory: Optional[str]
    keywords: List[str]
    concepts: List[str]
    quality_score: float
    complexity_score: float
    embedding: Optional[np.ndarray]
    chunks: List[Dict[str, Any]]
    metadata: Dict[str, Any]


@dataclass
class TextChunk:
    """Represents a chunk of text with metadata."""
    content: str
    start_pos: int
    end_pos: int
    chunk_type: str  # 'paragraph', 'section', 'chapter'
    importance_score: float
    concepts: List[str]
    embedding: Optional[np.ndarray] = None


class KnowledgeProcessor:
    """
    Advanced knowledge processor that extracts meaningful information
    from text sources and prepares it for Lyra's learning.
    """

    def __init__(
        self,
        device: Optional[torch.device] = None,
        embedding_model: str = "all-MiniLM-L6-v2",
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ):
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # NLP models
        self.nlp = None  # Will be loaded lazily
        self.embedding_model = None
        self.summarizer = None
        self.classifier = None

        # Text processing patterns
        self.sentence_splitter = re.compile(r'(?<=[.!?])\s+')
        self.paragraph_splitter = re.compile(r'\n\s*\n')

        # Knowledge categories and their keywords
        self.category_keywords = {
            'science': [
                'research', 'experiment', 'theory', 'hypothesis', 'data',
                'analysis', 'method', 'scientific', 'study', 'physics',
                'chemistry', 'biology', 'mathematics', 'astronomy'
            ],
            'history': [
                'century', 'ancient', 'civilization', 'empire', 'war',
                'revolution', 'culture', 'society', 'historical', 'period',
                'medieval', 'renaissance', 'industrial', 'modern'
            ],
            'philosophy': [
                'ethics', 'morality', 'existence', 'reality', 'consciousness',
                'logic', 'reason', 'truth', 'knowledge', 'metaphysics',
                'epistemology', 'philosopher', 'philosophical', 'wisdom'
            ],
            'literature': [
                'character', 'plot', 'theme', 'narrative', 'poetry',
                'novel', 'story', 'drama', 'author', 'literary',
                'fiction', 'metaphor', 'symbolism', 'prose'
            ],
            'art': [
                'painting', 'sculpture', 'artist', 'creative', 'aesthetic',
                'beauty', 'design', 'color', 'form', 'style',
                'movement', 'gallery', 'museum', 'artistic'
            ],
            'technology': [
                'computer', 'software', 'programming', 'digital', 'internet',
                'algorithm', 'innovation', 'engineering', 'technical',
                'machine', 'automation', 'electronics', 'invention'
            ]
        }

        # Quality indicators
        self.quality_indicators = {
            'positive': [
                'evidence', 'research', 'study', 'analysis', 'peer-reviewed',
                'academic', 'scholarly', 'university', 'institute', 'journal'
            ],
            'negative': [
                'unverified', 'rumor', 'gossip', 'speculation', 'opinion',
                'conspiracy', 'myth', 'fake', 'false', 'misleading'
            ]
        }

    async def initialize(self):
        """Initialize NLP models and resources."""
        logger.info("Initializing knowledge processor...")

        # Download required NLTK data
        try:
            nltk.download('punkt', quiet=True)
            nltk.download('stopwords', quiet=True)
            nltk.download('wordnet', quiet=True)
            nltk.download('averaged_perceptron_tagger', quiet=True)
        except Exception as e:
            logger.warning(f"Failed to download some NLTK data: {e}")

        # Load spaCy model
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            logger.warning("spaCy model not found, downloading...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # Load embedding model
        self.embedding_model = SentenceTransformer(
            "sentence-transformers/all-MiniLM-L6-v2",
            device=self.device
        )

        # Load summarization model
        self.summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=0 if self.device.type == "cuda" else -1
        )

        # Load text classification model
        self.classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=0 if self.device.type == "cuda" else -1
        )

        logger.info("Knowledge processor initialized successfully")

    async def process_text_file(
        self,
        file_path: Path,
        title: Optional[str] = None,
        source_metadata: Optional[Dict[str, Any]] = None
    ) -> ProcessedKnowledge:
        """
        Process a text file and extract structured knowledge.

        Args:
            file_path: Path to the text file
            title: Optional title (will be extracted if not provided)
            source_metadata: Additional metadata about the source

        Returns:
            ProcessedKnowledge object
        """
        logger.info(f"Processing text file: {file_path}")

        # Read file content
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                raw_content = f.read()
        except Exception as e:
            logger.error(f"Failed to read file {file_path}: {e}")
            raise

        # Detect and clean text format
        cleaned_content = await self._clean_text(raw_content)

        # Extract title if not provided
        if not title:
            title = await self._extract_title(cleaned_content, file_path.name)

        # Process the content
        return await self._process_content(
            title=title,
            content=cleaned_content,
            source_metadata=source_metadata or {}
        )

    async def process_web_content(
        self,
        html_content: str,
        title: Optional[str] = None,
        url: Optional[str] = None
    ) -> ProcessedKnowledge:
        """Process HTML content from web sources."""
        # Extract text from HTML
        soup = BeautifulSoup(html_content, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'aside']):
            element.decompose()

        # Extract title
        if not title:
            title_elem = soup.find('title')
            title = title_elem.get_text(strip=True) if title_elem else "Web Content"

        # Extract main content
        main_content = soup.get_text(separator='\n', strip=True)
        cleaned_content = await self._clean_text(main_content)

        source_metadata = {'source_type': 'web', 'url': url}
        return await self._process_content(title, cleaned_content, source_metadata)

    async def _process_content(
        self,
        title: str,
        content: str,
        source_metadata: Dict[str, Any]
    ) -> ProcessedKnowledge:
        """Core content processing logic."""

        # Analyze content structure
        chunks = await self._chunk_text(content)

        # Extract concepts and keywords
        concepts = await self._extract_concepts(content)
        keywords = await self._extract_keywords(content)

        # Classify content
        category, subcategory = await self._classify_content(content, title)

        # Calculate quality scores
        quality_score = await self._calculate_quality_score(content, title)
        complexity_score = await self._calculate_complexity_score(content)

        # Generate summary
        summary = await self._generate_summary(content)

        # Generate embeddings
        content_embedding = await self._generate_embedding(content)

        # Process chunks with embeddings
        processed_chunks = []
        for chunk in chunks:
            chunk_embedding = await self._generate_embedding(chunk.content)
            chunk_dict = {
                'content': chunk.content,
                'start_pos': chunk.start_pos,
                'end_pos': chunk.end_pos,
                'chunk_type': chunk.chunk_type,
                'importance_score': chunk.importance_score,
                'concepts': chunk.concepts,
                'embedding': chunk_embedding.tolist() if chunk_embedding is not None else None
            }
            processed_chunks.append(chunk_dict)

        # Prepare metadata
        metadata = {
            **source_metadata,
            'processing_timestamp': str(asyncio.get_event_loop().time()),
            'word_count': len(content.split()),
            'sentence_count': len(self.sentence_splitter.split(content)),
            'paragraph_count': len(self.paragraph_splitter.split(content)),
            'readability_score': textstat.flesch_reading_ease(content),
            'language': 'en'  # Could be detected
        }

        return ProcessedKnowledge(
            title=title,
            content=content,
            summary=summary,
            category=category,
            subcategory=subcategory,
            keywords=keywords,
            concepts=concepts,
            quality_score=quality_score,
            complexity_score=complexity_score,
            embedding=content_embedding,
            chunks=processed_chunks,
            metadata=metadata
        )

    async def _clean_text(self, raw_content: str) -> str:
        """Clean and normalize text content."""
        # Remove excessive whitespace
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', raw_content)
        content = re.sub(r'[ \t]+', ' ', content)

        # Remove common Gutenberg headers/footers
        content = re.sub(
            r'\*\*\*\s*START OF .*?\*\*\*.*?\n',
            '',
            content,
            flags=re.DOTALL | re.IGNORECASE
        )
        content = re.sub(
            r'\*\*\*\s*END OF .*?\*\*\*.*',
            '',
            content,
            flags=re.DOTALL | re.IGNORECASE
        )

        # Remove page numbers and chapter markers that might interfere
        content = re.sub(r'\n\s*\d+\s*\n', '\n', content)
        content = re.sub(r'\n\s*Page \d+\s*\n', '\n', content, flags=re.IGNORECASE)

        # Normalize quotes and dashes
        content = content.replace('"', '"').replace('"', '"')
        content = content.replace(''', "'").replace(''', "'")
        content = content.replace('—', '--').replace('–', '-')

        return content.strip()

    async def _extract_title(self, content: str, filename: str) -> str:
        """Extract title from content or filename."""
        lines = content.split('\n')[:10]  # Check first 10 lines

        # Look for title patterns
        for line in lines:
            line = line.strip()
            if len(line) > 10 and len(line) < 100:
                # Check if line looks like a title
                if line.isupper() or line.istitle():
                    return line

        # Extract from filename as fallback
        title = filename.replace('_', ' ').replace('-', ' ')
        title = re.sub(r'\.[^.]+$', '', title)  # Remove extension
        title = re.sub(r'^\d+_?', '', title)   # Remove leading numbers

        return title.title()

    async def _chunk_text(self, content: str) -> List[TextChunk]:
        """Split text into meaningful chunks."""
        chunks = []
        paragraphs = self.paragraph_splitter.split(content)

        current_pos = 0
        for paragraph in paragraphs:
            if len(paragraph.strip()) < 50:  # Skip very short paragraphs
                current_pos += len(paragraph) + 2  # +2 for newlines
                continue

            # Determine chunk type
            chunk_type = self._determine_chunk_type(paragraph)

            # Calculate importance score
            importance_score = await self._calculate_chunk_importance(paragraph)

            # Extract concepts from chunk
            chunk_concepts = await self._extract_chunk_concepts(paragraph)

            chunk = TextChunk(
                content=paragraph.strip(),
                start_pos=current_pos,
                end_pos=current_pos + len(paragraph),
                chunk_type=chunk_type,
                importance_score=importance_score,
                concepts=chunk_concepts
            )

            chunks.append(chunk)
            current_pos += len(paragraph) + 2

        return chunks

    def _determine_chunk_type(self, paragraph: str) -> str:
        """Determine the type of text chunk."""
        if len(paragraph) < 100:
            return 'short_paragraph'
        elif any(keyword in paragraph.lower() for keyword in ['chapter', 'section', 'part']):
            return 'section_header'
        elif paragraph.strip().endswith(':'):
            return 'list_header'
        else:
            return 'paragraph'

    async def _calculate_chunk_importance(self, chunk: str) -> float:
        """Calculate importance score for a text chunk."""
        score = 0.5  # Base score

        # Length factor (not too short, not too long)
        length = len(chunk.split())
        if 50 <= length <= 200:
            score += 0.1
        elif length < 20:
            score -= 0.2

        # Keyword density
        important_words = [
            'important', 'significant', 'crucial', 'essential', 'key',
            'fundamental', 'principle', 'concept', 'theory', 'discovery'
        ]
        keyword_count = sum(1 for word in important_words if word in chunk.lower())
        score += min(0.3, keyword_count * 0.1)

        # Question presence (often indicates important information)
        question_count = chunk.count('?')
        score += min(0.2, question_count * 0.05)

        # Technical terms (using simple heuristic)
        doc = self.nlp(chunk[:1000])  # Limit for performance
        technical_terms = [
            token for token in doc
            if token.pos_ in ['NOUN', 'PROPN'] and len(token.text) > 6
        ]
        score += min(0.2, len(technical_terms) * 0.01)

        return min(1.0, max(0.0, score))

    async def _extract_concepts(self, content: str) -> List[str]:
        """Extract key concepts from content."""
        doc = self.nlp(content[:5000])  # Limit for performance

        # Extract noun phrases as concepts
        concepts = []
        for chunk in doc.noun_chunks:
            if len(chunk.text) > 3 and len(chunk.text.split()) <= 3:
                concepts.append(chunk.text.lower())

        # Extract named entities
        for ent in doc.ents:
            if ent.label_ in ['PERSON', 'ORG', 'GPE', 'EVENT', 'WORK_OF_ART']:
                concepts.append(ent.text.lower())

        # Remove duplicates and return top concepts
        concept_counts = Counter(concepts)
        return [concept for concept, count in concept_counts.most_common(20)]

    async def _extract_chunk_concepts(self, chunk: str) -> List[str]:
        """Extract concepts from a specific chunk."""
        doc = self.nlp(chunk[:1000])  # Limit for performance

        concepts = []
        for chunk_span in doc.noun_chunks:
            if len(chunk_span.text) > 3:
                concepts.append(chunk_span.text.lower())

        for ent in doc.ents:
            concepts.append(ent.text.lower())

        return list(set(concepts))[:10]  # Return unique concepts, limited

    async def _extract_keywords(self, content: str) -> List[str]:
        """Extract keywords from content."""
        doc = self.nlp(content[:5000])  # Limit for performance

        # Extract meaningful words
        keywords = []
        for token in doc:
            if (token.pos_ in ['NOUN', 'ADJ', 'VERB'] and
                not token.is_stop and
                not token.is_punct and
                len(token.text) > 3):
                keywords.append(token.lemma_.lower())

        # Count frequency and return top keywords
        keyword_counts = Counter(keywords)
        return [word for word, count in keyword_counts.most_common(15)]

    async def _classify_content(self, content: str, title: str) -> Tuple[str, Optional[str]]:
        """Classify content into categories."""
        # Combine title and first part of content for classification
        classification_text = f"{title}. {content[:1000]}"

        # Use keyword-based classification first (faster)
        category_scores = {}
        for category, keywords in self.category_keywords.items():
            score = sum(1 for keyword in keywords if keyword in classification_text.lower())
            category_scores[category] = score

        if category_scores and max(category_scores.values()) > 0:
            category = max(category_scores, key=category_scores.get)
        else:
            # Fallback to ML classification
            categories = list(self.category_keywords.keys())
            try:
                result = self.classifier(classification_text, categories)
                category = result['labels'][0]
            except Exception as e:
                logger.warning(f"Classification failed: {e}")
                category = 'general'

        # Determine subcategory based on more specific analysis
        subcategory = await self._determine_subcategory(content, category)

        return category, subcategory

    async def _determine_subcategory(self, content: str, category: str) -> Optional[str]:
        """Determine subcategory based on content analysis."""
        subcategory_mapping = {
            'science': {
                'physics': ['physics', 'quantum', 'relativity', 'mechanics'],
                'biology': ['biology', 'evolution', 'genetics', 'species'],
                'chemistry': ['chemistry', 'chemical', 'molecule', 'reaction'],
                'astronomy': ['astronomy', 'space', 'universe', 'planet', 'star']
            },
            'history': {
                'ancient': ['ancient', 'rome', 'greece', 'egypt', 'civilization'],
                'medieval': ['medieval', 'middle ages', 'feudal', 'knight'],
                'modern': ['modern', 'industrial', 'revolution', 'war', 'century']
            },
            'literature': {
                'fiction': ['novel', 'story', 'character', 'plot'],
                'poetry': ['poem', 'verse', 'rhyme', 'stanza'],
                'drama': ['play', 'theater', 'act', 'scene']
            }
        }

        if category in subcategory_mapping:
            content_lower = content[:2000].lower()
            subcategory_scores = {}

            for subcategory, keywords in subcategory_mapping[category].items():
                score = sum(1 for keyword in keywords if keyword in content_lower)
                subcategory_scores[subcategory] = score

            if subcategory_scores and max(subcategory_scores.values()) > 0:
                return max(subcategory_scores, key=subcategory_scores.get)

        return None

    async def _calculate_quality_score(self, content: str, title: str) -> float:
        """Calculate quality score for content."""
        score = 0.5  # Base score

        # Content length (optimal range)
        word_count = len(content.split())
        if 500 <= word_count <= 10000:
            score += 0.1
        elif word_count < 100:
            score -= 0.2

        # Readability
        try:
            readability = textstat.flesch_reading_ease(content)
            if 30 <= readability <= 70:  # Reasonable complexity
                score += 0.1
        except:
            pass

        # Quality indicators
        content_lower = content.lower()
        positive_indicators = sum(
            1 for indicator in self.quality_indicators['positive']
            if indicator in content_lower
        )
        negative_indicators = sum(
            1 for indicator in self.quality_indicators['negative']
            if indicator in content_lower
        )

        score += min(0.2, positive_indicators * 0.05)
        score -= min(0.3, negative_indicators * 0.1)

        # Title quality
        if len(title.split()) >= 3 and not title.isupper():
            score += 0.05

        return min(1.0, max(0.0, score))

    async def _calculate_complexity_score(self, content: str) -> float:
        """Calculate complexity score for content."""
        try:
            # Use various readability metrics
            flesch_score = textstat.flesch_reading_ease(content)
            flesch_kincaid = textstat.flesch_kincaid_grade(content)

            # Normalize to 0-1 scale
            complexity = 1.0 - (flesch_score / 100.0)
            complexity = max(0.0, min(1.0, complexity))

            return complexity
        except:
            return 0.5  # Default complexity

    async def _generate_summary(self, content: str) -> str:
        """Generate summary of content."""
        try:
            # Limit content length for summarization
            max_length = 1024
            if len(content) > max_length:
                # Take first part of content
                content_to_summarize = content[:max_length]
            else:
                content_to_summarize = content

            # Generate summary
            summary_result = self.summarizer(
                content_to_summarize,
                max_length=150,
                min_length=50,
                do_sample=False
            )

            return summary_result[0]['summary_text']

        except Exception as e:
            logger.warning(f"Summarization failed: {e}")
            # Fallback: return first few sentences
            sentences = self.sentence_splitter.split(content)[:3]
            return ' '.join(sentences)

    async def _generate_embedding(self, text: str) -> Optional[np.ndarray]:
        """Generate embedding for text."""
        try:
            # Limit text length
            if len(text) > 500:
                text = text[:500]

            embedding = self.embedding_model.encode(text, convert_to_numpy=True)
            return embedding

        except Exception as e:
            logger.warning(f"Embedding generation failed: {e}")
            return None

    def get_processing_statistics(self) -> Dict[str, Any]:
        """Get statistics about processed knowledge."""
        return {
            'models_loaded': {
                'nlp': self.nlp is not None,
                'embedding_model': self.embedding_model is not None,
                'summarizer': self.summarizer is not None,
                'classifier': self.classifier is not None
            },
            'chunk_size': self.chunk_size,
            'chunk_overlap': self.chunk_overlap,
            'supported_categories': list(self.category_keywords.keys()),
            'device': str(self.device)
        }