Commit 57750e01 authored by Administrator

Update 4 files via Son of Anton

parent 6774dcff
...@@ -37,4 +37,10 @@ BEDROCK_ENDPOINT: str = ( ...@@ -37,4 +37,10 @@ BEDROCK_ENDPOINT: str = (
f"https://bedrock-runtime.{AWS_REGION}.amazonaws.com" f"https://bedrock-runtime.{AWS_REGION}.amazonaws.com"
) )
APP_VERSION: str = "4.0.0" # SerpAPI for web search
\ No newline at end of file SERPAPI_KEY: str = os.getenv(
"SERPAPI_KEY",
"0f9efa98fb0fe7b27af609e8dd80e04c4af1e098ec81fe628a6d63aaaebe8bd6",
)
APP_VERSION: str = "4.2.0"
\ No newline at end of file
""" """
Web Search Service — v4.1.1 — Robust multi-strategy with proper region handling. Web Search Service — v4.2.0 — SerpAPI (Google Search)
Primary: duckduckgo-search library (forced US-English) Clean. Reliable. No scraping bullshit.
Fallback 1: DDG HTML scraping with anti-detection
Fallback 2: DDG Lite scraping
Fallback 3: Brave Search HTML scraping
""" """
import re
import asyncio import asyncio
import random
import logging import logging
from urllib.parse import quote_plus, urlparse, parse_qs from urllib.parse import urlencode
from concurrent.futures import ThreadPoolExecutor
import httpx import httpx
from backend.config import SERPAPI_KEY
logger = logging.getLogger("son_of_anton.web_search") logger = logging.getLogger("son_of_anton.web_search")
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# duckduckgo-search library (primary) # SerpAPI Configuration
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
try: SERPAPI_BASE = "https://serpapi.com/search"
from duckduckgo_search import DDGS
HAS_DDGS = True
logger.info("duckduckgo-search library available — using as primary search method")
except ImportError:
HAS_DDGS = False
logger.warning("duckduckgo-search library NOT installed — falling back to HTML scraping")
_executor = ThreadPoolExecutor(max_workers=2)
_USER_AGENTS = [ _USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
] ]
# Domains that are NEVER useful in English search results # Domains we never want cluttering results
BLOCKED_DOMAINS = { BLOCKED_DOMAINS = {
"baidu.com", "zhihu.com", "csdn.net", "bilibili.com", "weibo.com", "baidu.com", "zhihu.com", "csdn.net", "bilibili.com", "weibo.com",
"sogou.com", "163.com", "qq.com", "taobao.com", "jd.com", "sogou.com", "163.com", "qq.com", "taobao.com", "jd.com",
"douyin.com", "tiktok.com", "yandex.ru", "mail.ru", "douyin.com", "tiktok.com", "yandex.ru", "mail.ru",
"duckduckgo.com", "search.brave.com", "google.com/search",
} }
def _get_headers():
return {
"User-Agent": random.choice(_USER_AGENTS),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Cache-Control": "max-age=0",
}
def _is_valid_result(url: str, title: str) -> bool: def _is_valid_result(url: str, title: str) -> bool:
"""Filter out garbage results — non-English, ad domains, empty junk.""" """Filter garbage results."""
if not url or not url.startswith("http"): if not url or not url.startswith("http"):
return False return False
if not title or len(title.strip()) < 3: if not title or len(title.strip()) < 3:
return False return False
try: try:
parsed = urlparse(url) from urllib.parse import urlparse
domain = parsed.netloc.lower().lstrip("www.") domain = urlparse(url).netloc.lower().lstrip("www.")
for blocked in BLOCKED_DOMAINS:
if domain == blocked or domain.endswith("." + blocked):
return False
except Exception: except Exception:
return False pass
# Block known garbage domains
for blocked in BLOCKED_DOMAINS:
if domain == blocked or domain.endswith("." + blocked):
return False
# Block results that are clearly search engine pages, not actual results
if "/search?" in url or "/search/" in url:
return False
return True return True
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Main entry point # Main Entry Point
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str: async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str:
""" """
Search the web using multiple strategies with automatic fallback. Search the web via SerpAPI (Google).
Returns formatted search results as a string for the LLM context. Returns formatted search results string for LLM context.
""" """
logger.info(f"Web search initiated: query='{query[:80]}', num_results={num_results}") logger.info(f"Web search initiated: query='{query[:80]}', num_results={num_results}")
results = [] if not SERPAPI_KEY:
logger.error("SERPAPI_KEY not configured!")
# Strategy 1: duckduckgo-search library (most reliable) return f"[Web search failed: SerpAPI key not configured. Answer from your own knowledge.]"
if HAS_DDGS:
results = await _ddgs_library_search(query, num_results)
if results:
logger.info(f"DDGS library returned {len(results)} results")
# Strategy 2: DDG HTML scraping
if not results:
logger.info("DDGS library failed or unavailable, trying HTML scraping...")
results = await _ddg_html_search(query, num_results)
if results:
logger.info(f"DDG HTML scraping returned {len(results)} results")
# Strategy 3: DDG Lite
if not results:
logger.info("HTML scraping failed, trying DDG Lite...")
results = await _ddg_lite_search(query, num_results)
if results:
logger.info(f"DDG Lite returned {len(results)} results")
# Strategy 4: Brave Search results = await _serpapi_search(query, num_results)
if not results:
logger.info("DDG Lite failed, trying Brave Search...")
results = await _brave_html_search(query, num_results)
if results:
logger.info(f"Brave HTML search returned {len(results)} results")
# Strategy 5: Google scraping (last resort)
if not results: if not results:
logger.info("Brave failed, trying Google scraping...") logger.warning(f"SerpAPI returned no results for: '{query[:80]}'")
results = await _google_html_search(query, num_results) return f"[Web search for '{query}' returned no results. Answer from your own knowledge.]"
if results:
logger.info(f"Google HTML search returned {len(results)} results")
if not results: logger.info(f"SerpAPI returned {len(results)} results for: '{query[:60]}'")
logger.warning(f"ALL search strategies failed for query: '{query[:80]}'")
return f"[Web search for '{query}' returned no results. All search strategies exhausted. Answer from your own knowledge.]"
# Format results # Format results
lines = [ lines = [
"═" * 60, "═" * 60,
"WEB SEARCH RESULTS", "WEB SEARCH RESULTS (via Google/SerpAPI)",
f"Query: {query}", f"Query: {query}",
f"Results: {len(results)}", f"Results: {len(results)}",
"═" * 60, "", "═" * 60, "",
] ]
for i, r in enumerate(results, 1): for i, r in enumerate(results, 1):
lines.append(f"[{i}] {r['title']}") lines.append(f"[{i}] {r['title']}")
lines.append(f" URL: {r['url']}") if r.get("url"):
lines.append(f" URL: {r['url']}")
if r.get("snippet"): if r.get("snippet"):
lines.append(f" {r['snippet']}") lines.append(f" {r['snippet']}")
if r.get("date"):
lines.append(f" Date: {r['date']}")
lines.append("") lines.append("")
# Fetch full page content for top results # Fetch full page content for top results
if fetch_pages > 0: if fetch_pages > 0:
detailed = await _fetch_pages(results[:fetch_pages]) # Only fetch actual URLs (skip knowledge graph entries without URLs)
fetchable = [r for r in results if r.get("url") and r["url"].startswith("http")]
detailed = await _fetch_pages(fetchable[:fetch_pages])
if detailed: if detailed:
lines.append("═" * 60) lines.append("═" * 60)
lines.append("DETAILED PAGE CONTENT:") lines.append("DETAILED PAGE CONTENT:")
...@@ -174,408 +109,203 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> ...@@ -174,408 +109,203 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Strategy 1: duckduckgo-search library # SerpAPI Google Search
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
def _ddgs_sync_search(query: str, n: int) -> list[dict]: async def _serpapi_search(query: str, n: int) -> list[dict]:
"""Synchronous DDGS search — runs in thread executor.""" """Hit SerpAPI's Google Search endpoint and parse results."""
results = []
# Try multiple configurations in order
configs = [
{"region": "us-en", "backend": "auto"},
{"region": "us-en", "backend": "html"},
{"region": "us-en", "backend": "lite"},
{"region": "wt-wt", "backend": "auto"},
]
for cfg in configs:
if results:
break
try:
with DDGS() as ddgs:
kwargs = {
"keywords": query,
"max_results": n + 5, # Request extra, filter later
"region": cfg["region"],
"safesearch": "moderate",
}
# backend parameter exists in newer versions
try:
kwargs["backend"] = cfg["backend"]
raw = list(ddgs.text(**kwargs))
except TypeError:
# Older version without backend parameter
del kwargs["backend"]
raw = list(ddgs.text(**kwargs))
for r in raw:
title = r.get("title", "").strip()
url = r.get("href", r.get("link", "")).strip()
snippet = r.get("body", r.get("snippet", "")).strip()
if _is_valid_result(url, title):
results.append({"title": title, "url": url, "snippet": snippet})
if len(results) >= n:
break
if results:
logger.info(f"DDGS config {cfg} returned {len(results)} valid results")
return results
except Exception as e:
logger.warning(f"DDGS config {cfg} failed: {type(e).__name__}: {e}")
continue
return results
async def _ddgs_library_search(query: str, n: int) -> list[dict]:
"""Run the synchronous DDGS search in a thread pool."""
if not HAS_DDGS:
return []
try: try:
loop = asyncio.get_event_loop() params = {
return await asyncio.wait_for( "q": query,
loop.run_in_executor(_executor, _ddgs_sync_search, query, n), "api_key": SERPAPI_KEY,
timeout=30.0, "engine": "google",
) "num": min(n + 5, 20),
except asyncio.TimeoutError: "hl": "en",
logger.error("DDGS library search timed out") "gl": "us",
return [] "safe": "off",
except Exception as e: "no_cache": "false",
logger.error(f"DDGS executor error: {e}") }
return []
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.get(SERPAPI_BASE, params=params)
# ═══════════════════════════════════════════════════
# Strategy 2: DDG HTML scraping (improved)
# ═══════════════════════════════════════════════════
async def _ddg_html_search(query: str, n: int) -> list[dict]:
try:
headers = _get_headers()
headers["Referer"] = "https://duckduckgo.com/"
headers["Origin"] = "https://html.duckduckgo.com"
headers["Content-Type"] = "application/x-www-form-urlencoded"
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
http2=False,
) as client:
# First, hit the main page to get cookies
try:
await client.get("https://html.duckduckgo.com/html/", headers=_get_headers())
except Exception:
pass
await asyncio.sleep(random.uniform(0.3, 1.0))
resp = await client.post(
"https://html.duckduckgo.com/html/",
data={"q": query, "b": "", "kl": "us-en", "kp": "-1"},
headers=headers,
)
if resp.status_code != 200: if resp.status_code != 200:
logger.warning(f"DDG HTML returned status {resp.status_code}") error_text = resp.text[:500]
logger.error(f"SerpAPI HTTP {resp.status_code}: {error_text}")
try:
err_data = resp.json()
logger.error(f"SerpAPI error: {err_data.get('error', 'unknown')}")
except Exception:
pass
return [] return []
html = resp.text data = resp.json()
if len(html) < 500: if "error" in data:
logger.warning(f"DDG HTML response suspiciously short: {len(html)} chars") logger.error(f"SerpAPI error: {data['error']}")
return [] return []
if "Please try again" in html or "bot" in html.lower()[:500]:
logger.warning("DDG HTML appears to have returned a bot-detection page")
return []
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results = [] results = []
# Try multiple selector strategies # ── Answer box / Featured snippet (highest priority) ──
selectors = [ answer = data.get("answer_box", {})
(".result__body", ".result__title a, .result__a", ".result__snippet"), if answer:
(".result", ".result__a", ".result__snippet"), answer_text = (
(".web-result", ".result__a", ".result__snippet"), answer.get("answer", "")
(".results .result", "a.result__a", ".result__snippet"), or answer.get("snippet", "")
] or answer.get("result", "")
for container_sel, link_sel, snippet_sel in selectors:
elements = soup.select(container_sel)
if not elements:
continue
for el in elements:
title_a = el.select_one(link_sel)
snippet_el = el.select_one(snippet_sel)
if not title_a:
continue
title = title_a.get_text(strip=True)
href = title_a.get("href", "")
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
# DDG wraps URLs in a redirect
if "uddg=" in href:
parsed = parse_qs(urlparse(href).query)
href = parsed.get("uddg", [href])[0]
elif href.startswith("//duckduckgo.com/l/?"):
parsed = parse_qs(urlparse(href).query)
href = parsed.get("uddg", parsed.get("u", [href]))[0]
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
break
if results:
break
# Last resort: find ANY links that look like search results
if not results:
for a_tag in soup.find_all("a", href=True):
href = a_tag.get("href", "")
text = a_tag.get_text(strip=True)
if "uddg=" in href:
parsed = parse_qs(urlparse(href).query)
real_url = parsed.get("uddg", [""])[0]
if _is_valid_result(real_url, text):
results.append({"title": text, "url": real_url, "snippet": ""})
if len(results) >= n:
break
logger.info(f"DDG HTML parsed {len(results)} results from {len(html)} chars of HTML")
return results
except Exception as e:
logger.error(f"DDG HTML search error: {type(e).__name__}: {e}")
return []
# ═══════════════════════════════════════════════════
# Strategy 3: DDG Lite
# ═══════════════════════════════════════════════════
async def _ddg_lite_search(query: str, n: int) -> list[dict]:
try:
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
) as client:
resp = await client.post(
"https://lite.duckduckgo.com/lite/",
data={"q": query, "kl": "us-en"},
headers=_get_headers(),
) )
if resp.status_code != 200: if answer_text:
logger.warning(f"DDG Lite returned status {resp.status_code}") results.append({
return [] "title": f"[Featured Answer] {answer.get('title', query)}",
"url": answer.get("link", ""),
html = resp.text "snippet": str(answer_text)[:500],
"date": "",
from bs4 import BeautifulSoup })
soup = BeautifulSoup(html, "html.parser")
results = [] # ── Knowledge graph ──
kg = data.get("knowledge_graph", {})
for a_tag in soup.select("a.result-link"): if kg and kg.get("title"):
href = a_tag.get("href", "") kg_parts = []
title = a_tag.get_text(strip=True) if kg.get("description"):
if not _is_valid_result(href, title): kg_parts.append(kg["description"])
continue if kg.get("type"):
kg_parts.append(f"Type: {kg['type']}")
snippet = ""
parent_tr = a_tag.find_parent("tr") for key in ["born", "died", "founded", "headquarters", "ceo",
if parent_tr: "revenue", "employees", "website", "nationality",
next_tr = parent_tr.find_next_sibling("tr") "genre", "awards", "education", "height", "weight",
if next_tr: "capital", "population", "area", "currency",
snippet_td = next_tr.find("td", class_="result-snippet") "president", "prime_minister"]:
if snippet_td: val = kg.get(key)
snippet = snippet_td.get_text(strip=True)[:300] if val:
elif next_tr: kg_parts.append(f"{key.replace('_', ' ').title()}: {val}")
snippet = next_tr.get_text(strip=True)[:300]
# Also grab "known attributes" list if present
results.append({"title": title, "url": href, "snippet": snippet}) for attr in kg.get("attributes", {}).items():
if len(results) >= n: if len(kg_parts) < 15:
kg_parts.append(f"{attr[0]}: {attr[1]}")
if kg_parts:
results.append({
"title": f"[Knowledge Graph] {kg['title']}",
"url": kg.get("website", kg.get("source", {}).get("link", "")),
"snippet": " | ".join(kg_parts),
"date": "",
})
# ── Organic results (main search results) ──
for item in data.get("organic_results", []):
title = item.get("title", "").strip()
url = item.get("link", "").strip()
snippet = item.get("snippet", "").strip()
date = item.get("date", "")
# Rich snippet extras
rich_snippet = item.get("rich_snippet", {})
if rich_snippet:
top = rich_snippet.get("top", {})
if top.get("detected_extensions", {}).get("rating"):
snippet += f" | Rating: {top['detected_extensions']['rating']}"
if _is_valid_result(url, title):
results.append({
"title": title,
"url": url,
"snippet": snippet,
"date": date,
})
if len(results) >= n + 3: # +3 for KG/answer/news
break break
# Fallback: any td a[href^='http'] # ── Top stories (news) ──
if not results: for story in data.get("top_stories", [])[:3]:
for a_tag in soup.select("td a[href^='http']"): title = story.get("title", "").strip()
href = a_tag.get("href", "") url = story.get("link", "").strip()
title = a_tag.get_text(strip=True) source = story.get("source", "")
if _is_valid_result(href, title) and len(title) > 5: date = story.get("date", "")
results.append({"title": title, "url": href, "snippet": ""})
if len(results) >= n: if url and title:
break results.append({
"title": f"[News] {title}",
return results "url": url,
"snippet": f"Source: {source}" + (f" | {date}" if date else ""),
except Exception as e: "date": date,
logger.error(f"DDG Lite search error: {type(e).__name__}: {e}") })
return []
# ── Related questions (People Also Ask) ──
paa = data.get("related_questions", [])
# ═══════════════════════════════════════════════════ if paa:
# Strategy 4: Brave Search HTML scraping paa_lines = []
# ═══════════════════════════════════════════════════ for q in paa[:4]:
question = q.get("question", "")
async def _brave_html_search(query: str, n: int) -> list[dict]: paa_snippet = q.get("snippet", "")
"""Scrape Brave Search as a fallback.""" if question:
try: entry = question
headers = _get_headers() if paa_snippet:
headers["Referer"] = "https://search.brave.com/" entry += f" → {paa_snippet[:150]}"
paa_lines.append(entry)
async with httpx.AsyncClient( if paa_lines:
timeout=20.0, results.append({
follow_redirects=True, "title": "[People Also Ask]",
) as client: "url": "",
resp = await client.get( "snippet": " | ".join(paa_lines),
"https://search.brave.com/search", "date": "",
params={"q": query, "source": "web"}, })
headers=headers,
) # ── Related searches ──
if resp.status_code != 200: related = data.get("related_searches", [])
logger.warning(f"Brave Search returned status {resp.status_code}") if related:
return [] related_queries = [r.get("query", "") for r in related[:5] if r.get("query")]
if related_queries:
html = resp.text results.append({
"title": "[Related Searches]",
from bs4 import BeautifulSoup "url": "",
soup = BeautifulSoup(html, "html.parser") "snippet": " | ".join(related_queries),
results = [] "date": "",
})
for item in soup.select(".snippet, [data-type='web']"):
title_el = item.select_one(".snippet-title, .title, a[href^='http']") logger.info(f"SerpAPI: {len(data.get('organic_results', []))} organic, "
desc_el = item.select_one(".snippet-description, .snippet-content, .description") f"KG={'yes' if kg.get('title') else 'no'}, "
f"answer={'yes' if answer else 'no'}, "
if not title_el: f"news={len(data.get('top_stories', []))}, "
continue f"total={len(results)}")
title = title_el.get_text(strip=True)
href = title_el.get("href", "")
if not href.startswith("http"):
link = item.select_one("a[href^='http']")
if link:
href = link.get("href", "")
snippet = desc_el.get_text(strip=True) if desc_el else ""
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
break
logger.info(f"Brave Search parsed {len(results)} results")
return results return results
except Exception as e: except httpx.TimeoutException:
logger.error(f"Brave HTML search error: {type(e).__name__}: {e}") logger.error("SerpAPI request timed out")
return [] return []
# ═══════════════════════════════════════════════════
# Strategy 5: Google HTML scraping (last resort)
# ═══════════════════════════════════════════════════
async def _google_html_search(query: str, n: int) -> list[dict]:
"""Scrape Google search results as absolute last resort."""
try:
headers = _get_headers()
headers["Referer"] = "https://www.google.com/"
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
) as client:
resp = await client.get(
"https://www.google.com/search",
params={"q": query, "hl": "en", "gl": "us", "num": str(n + 5)},
headers=headers,
)
if resp.status_code != 200:
logger.warning(f"Google returned status {resp.status_code}")
return []
html = resp.text
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results = []
# Google search results are in divs with class 'g' or similar
for g_div in soup.select("div.g, div.tF2Cxc, div[data-sokoban-container]"):
# Find the link
link_el = g_div.select_one("a[href^='http']")
if not link_el:
continue
href = link_el.get("href", "")
# Clean Google redirect URLs
if href.startswith("/url?"):
parsed = parse_qs(urlparse(href).query)
href = parsed.get("q", parsed.get("url", [href]))[0]
# Find title
title_el = g_div.select_one("h3")
title = title_el.get_text(strip=True) if title_el else link_el.get_text(strip=True)
# Find snippet
snippet_el = g_div.select_one(".VwiC3b, .IsZvec, .s3v9rd, span.st")
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
break
# Fallback: just find any h3 > a patterns
if not results:
for h3 in soup.find_all("h3"):
parent_a = h3.find_parent("a")
if parent_a:
href = parent_a.get("href", "")
if href.startswith("/url?"):
parsed = parse_qs(urlparse(href).query)
href = parsed.get("q", parsed.get("url", [href]))[0]
title = h3.get_text(strip=True)
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": ""})
if len(results) >= n:
break
logger.info(f"Google HTML parsed {len(results)} results")
return results
except Exception as e: except Exception as e:
logger.error(f"Google HTML search error: {type(e).__name__}: {e}") logger.error(f"SerpAPI search error: {type(e).__name__}: {e}")
return [] return []
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Page content fetcher # Page Content Fetcher
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
async def _fetch_pages(results: list[dict]) -> list[dict]: async def _fetch_pages(results: list[dict]) -> list[dict]:
"""Fetch and extract text content from result URLs.""" """Fetch and extract text content from result URLs."""
async def _fetch_one(r: dict) -> dict | None: async def _fetch_one(r: dict) -> dict | None:
url = r.get("url", "")
if not url or not url.startswith("http"):
return None
try: try:
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=12.0, timeout=12.0,
follow_redirects=True, follow_redirects=True,
http2=False, http2=False,
) as client: ) as client:
resp = await client.get(r["url"], headers=_get_headers()) headers = {
"User-Agent": _USER_AGENTS[0],
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
}
resp = await client.get(url, headers=headers)
if resp.status_code != 200: if resp.status_code != 200:
return None return None
...@@ -589,14 +319,14 @@ async def _fetch_pages(results: list[dict]) -> list[dict]: ...@@ -589,14 +319,14 @@ async def _fetch_pages(results: list[dict]) -> list[dict]:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
# Remove non-content elements # Remove noise
for tag in soup(["script", "style", "nav", "footer", "header", "aside", for tag in soup(["script", "style", "nav", "footer", "header", "aside",
"form", "noscript", "svg", "iframe", "button", "input", "form", "noscript", "svg", "iframe", "button", "input",
"select", "textarea", "menu", "[role='navigation']", "select", "textarea", "menu", "[role='navigation']",
"[role='banner']", "[role='complementary']"]): "[role='banner']", "[role='complementary']"]):
tag.decompose() tag.decompose()
# Try to find main content area # Find main content
main = soup.select_one( main = soup.select_one(
"main, article, [role='main'], .post-content, .entry-content, " "main, article, [role='main'], .post-content, .entry-content, "
".article-body, .article-content, #content, .content, " ".article-body, .article-content, #content, .content, "
...@@ -605,15 +335,15 @@ async def _fetch_pages(results: list[dict]) -> list[dict]: ...@@ -605,15 +335,15 @@ async def _fetch_pages(results: list[dict]) -> list[dict]:
text = (main or soup.body or soup).get_text(separator="\n", strip=True) text = (main or soup.body or soup).get_text(separator="\n", strip=True)
# Clean up — remove very short lines (navigation remnants) # Clean up nav remnants
lines = [l.strip() for l in text.split("\n") if l.strip() and len(l.strip()) > 15] lines = [l.strip() for l in text.split("\n") if l.strip() and len(l.strip()) > 15]
text = "\n".join(lines) text = "\n".join(lines)
if len(text) > 200: if len(text) > 200:
return {"title": r["title"], "url": r["url"], "content": text} return {"title": r["title"], "url": url, "content": text}
except Exception as e: except Exception as e:
logger.debug(f"Failed to fetch {r.get('url', '?')}: {e}") logger.debug(f"Failed to fetch {url}: {e}")
return None return None
......
...@@ -12,5 +12,4 @@ pydantic==2.10.4 ...@@ -12,5 +12,4 @@ pydantic==2.10.4
Pillow==11.1.0 Pillow==11.1.0
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
python-pptx==1.0.2 python-pptx==1.0.2
python-docx==1.1.2 python-docx==1.1.2
duckduckgo-search>=7.0.0 \ No newline at end of file
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment