Added another learning source for Nora. Also added the requirements.

2025-06-09 14:25:11 -04:00
parent da23742671
commit 5d53ba7cb8
14 changed files with 1070 additions and 78 deletions

knowledge_retriever.py (new file, 107 lines)

@@ -0,0 +1,107 @@
# knowledge_retriever.py
import os
import re
import time
import logging
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Where to dump new ".txt" files scraped from the web
SCRAPE_DIR = "data/books/scraped"

# Simple rate limiter to avoid hammering any one domain
_last_request_time = {}
def fetch_url(url: str, min_interval: float = 1.0) -> str:
    """
    Fetch a URL's HTML, enforcing at least `min_interval` seconds between
    requests to the same domain. Returns the HTML string, or an empty
    string on failure.
    """
    domain = urlparse(url).netloc
    now = time.time()
    last = _last_request_time.get(domain, 0)
    wait = min_interval - (now - last)
    if wait > 0:
        time.sleep(wait)
    headers = {"User-Agent": "NoraScraper/1.0 (+https://your_project_url)"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        _last_request_time[domain] = time.time()
        return response.text
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
        return ""
def clean_html(html: str) -> str:
    """
    Strip scripts, styles, and tags; return plain text.
    """
    soup = BeautifulSoup(html, "html.parser")
    # remove scripts, styles, and noscript blocks
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    # collapse multiple blank lines
    lines = [line.strip() for line in text.splitlines()]
    text = "\n".join(line for line in lines if line)
    return text
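# Sketch of clean_html on a toy document (illustrative, not part of the module):
#
#     html = "<html><style>p{color:red}</style><body><p>Hello</p><p>World</p></body></html>"
#     clean_html(html)  # -> "Hello\nWorld": style block dropped, blank lines collapsed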
def save_text(content: str, title: str):
    """
    Save content to a UTF-8 .txt file under SCRAPE_DIR. Filename is derived from title.
    """
    os.makedirs(SCRAPE_DIR, exist_ok=True)
    # sanitize title → filename
    safe = re.sub(r"[^0-9a-zA-Z_\-]", "_", title)
    fname = f"{safe[:50]}.txt"
    path = os.path.join(SCRAPE_DIR, fname)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info(f"Saved scraped page to {path}")
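# Example of the sanitization above (illustrative): a title like
# "Moby-Dick; or, The Whale" maps to "Moby-Dick__or__The_Whale.txt",
# and long titles are truncated to 50 characters before the extension.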
def scrape_and_store(url: str) -> bool:
    """
    High-level function: fetches the URL, cleans the HTML, extracts a title,
    and saves the result to a .txt file.
    """
    html = fetch_url(url)
    if not html:
        return False
    text = clean_html(html)
    # extract <title> if present; fall back to the URL itself
    m = re.search(r"<title>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    title = m.group(1).strip() if m else url
    save_text(text, title)
    return True
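# Note: the regex above works for typical pages; a hypothetical alternative
# (not in this commit) would reuse the parser instead of matching raw HTML:
#
#     soup = BeautifulSoup(html, "html.parser")
#     title = soup.title.string.strip() if soup.title and soup.title.string else url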
# Example usage:
if __name__ == "__main__":
    import sys

    # surface INFO-level messages (e.g. "Saved scraped page to ...") on the console
    logging.basicConfig(level=logging.INFO)
    if len(sys.argv) < 2:
        print("Usage: python knowledge_retriever.py <url1> [<url2> ...]")
        sys.exit(1)
    for link in sys.argv[1:]:
        if scrape_and_store(link):
            print(f"Scraped: {link}")
        else:
            print(f"Failed to scrape: {link}")