# knowledge_retriever.py

import logging
import os
import re
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Where to dump new ".txt" files scraped from the web
SCRAPE_DIR = "data/books/scraped"

# Simple per-domain rate-limiter state, to avoid hammering any one domain
_last_request_time = {}


def fetch_url(url: str, min_interval: float = 1.0) -> str:
    """
    Fetch a URL's HTML, enforcing at least `min_interval` seconds between
    requests to the same domain.
    Returns the HTML string, or an empty string on failure.
    """
    domain = urlparse(url).netloc
    now = time.time()
    last = _last_request_time.get(domain, 0)
    wait = min_interval - (now - last)
    if wait > 0:
        time.sleep(wait)

    headers = {"User-Agent": "NoraScraper/1.0 (+https://your_project_url)"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        _last_request_time[domain] = time.time()
        return response.text
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
        return ""


def clean_html(html: str) -> str:
    """
    Strip scripts, styles, and tags; return plain text.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Remove non-content elements before extracting text
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    # Strip per-line whitespace and collapse blank lines
    lines = [line.strip() for line in text.splitlines()]
    return "\n".join(line for line in lines if line)


def save_text(content: str, title: str):
    """
    Save content to a UTF-8 .txt file under SCRAPE_DIR.
    The filename is derived from the title.
    """
    os.makedirs(SCRAPE_DIR, exist_ok=True)
    # Sanitize the title into a safe filename (capped at 50 chars)
    safe = re.sub(r"[^0-9a-zA-Z_\-]", "_", title)
    fname = f"{safe[:50]}.txt"
    path = os.path.join(SCRAPE_DIR, fname)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info(f"Saved scraped page to {path}")


def scrape_and_store(url: str) -> bool:
    """
    High-level function: fetches the URL, cleans the HTML, extracts a title,
    and saves the text to a .txt file. Returns True on success.
    """
    html = fetch_url(url)
    if not html:
        return False
    text = clean_html(html)
    # Extract a title: prefer the <title> tag, fall back to the URL path
    soup = BeautifulSoup(html, "html.parser")
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = urlparse(url).path
    save_text(text, title or "untitled")
    return True
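

# A minimal usage sketch, assuming the module is run directly as a script.
# The example URL and the logging configuration below are illustrative
# additions, not part of the module's API.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Hypothetical example URL; substitute a page you are permitted to scrape.
    example_url = "https://example.com/some-article"
    ok = scrape_and_store(example_url)
    print(f"{example_url}: {'saved' if ok else 'failed'}")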