Commit 57750e01 authored by Administrator's avatar Administrator

Update 4 files via Son of Anton

parent 6774dcff
......@@ -37,4 +37,10 @@ BEDROCK_ENDPOINT: str = (
f"https://bedrock-runtime.{AWS_REGION}.amazonaws.com"
)
APP_VERSION: str = "4.0.0"
\ No newline at end of file
# SerpAPI for web search.
# The key is read from the environment only. A real key must never be committed
# as the fallback value: anything in version control has to be treated as leaked
# (the previously committed key should be revoked and rotated). An empty string
# means "not configured"; search_web() checks for that and returns a
# "key not configured" message instead of calling the API.
SERPAPI_KEY: str = os.getenv("SERPAPI_KEY", "")
APP_VERSION: str = "4.2.0"
\ No newline at end of file
"""
Web Search Service — v4.1.1 — Robust multi-strategy with proper region handling.
Primary: duckduckgo-search library (forced US-English)
Fallback 1: DDG HTML scraping with anti-detection
Fallback 2: DDG Lite scraping
Fallback 3: Brave Search HTML scraping
Web Search Service — v4.2.0 — SerpAPI (Google Search)
Clean. Reliable. No scraping bullshit.
"""
import re
import asyncio
import random
import logging
from urllib.parse import quote_plus, urlparse, parse_qs
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode
import httpx
from backend.config import SERPAPI_KEY
logger = logging.getLogger("son_of_anton.web_search")
# ═══════════════════════════════════════════════════
# duckduckgo-search library (primary)
# SerpAPI Configuration
# ═══════════════════════════════════════════════════
try:
from duckduckgo_search import DDGS
HAS_DDGS = True
logger.info("duckduckgo-search library available — using as primary search method")
except ImportError:
HAS_DDGS = False
logger.warning("duckduckgo-search library NOT installed — falling back to HTML scraping")
_executor = ThreadPoolExecutor(max_workers=2)
SERPAPI_BASE = "https://serpapi.com/search"
_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
]
# Domains that are NEVER useful in English search results
# Domains we never want cluttering results
BLOCKED_DOMAINS = {
"baidu.com", "zhihu.com", "csdn.net", "bilibili.com", "weibo.com",
"sogou.com", "163.com", "qq.com", "taobao.com", "jd.com",
"douyin.com", "tiktok.com", "yandex.ru", "mail.ru",
"duckduckgo.com", "search.brave.com", "google.com/search",
}
def _get_headers():
    """Build browser-like HTTP request headers.

    Returns:
        A new dict of headers on every call. Only the "User-Agent" value
        varies: it is picked at random from ``_USER_AGENTS``.
    """
    headers = {"User-Agent": random.choice(_USER_AGENTS)}
    headers.update(
        {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
        }
    )
    return headers
def _is_valid_result(url: str, title: str) -> bool:
"""Filter out garbage results — non-English, ad domains, empty junk."""
"""Filter garbage results."""
if not url or not url.startswith("http"):
return False
if not title or len(title.strip()) < 3:
return False
try:
parsed = urlparse(url)
domain = parsed.netloc.lower().lstrip("www.")
from urllib.parse import urlparse
domain = urlparse(url).netloc.lower().lstrip("www.")
for blocked in BLOCKED_DOMAINS:
if domain == blocked or domain.endswith("." + blocked):
return False
except Exception:
return False
# Block known garbage domains
for blocked in BLOCKED_DOMAINS:
if domain == blocked or domain.endswith("." + blocked):
return False
# Block results that are clearly search engine pages, not actual results
if "/search?" in url or "/search/" in url:
return False
pass
return True
# ═══════════════════════════════════════════════════
# Main entry point
# Main Entry Point
# ═══════════════════════════════════════════════════
async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str:
"""
Search the web using multiple strategies with automatic fallback.
Returns formatted search results as a string for the LLM context.
Search the web via SerpAPI (Google).
Returns formatted search results string for LLM context.
"""
logger.info(f"Web search initiated: query='{query[:80]}', num_results={num_results}")
results = []
# Strategy 1: duckduckgo-search library (most reliable)
if HAS_DDGS:
results = await _ddgs_library_search(query, num_results)
if results:
logger.info(f"DDGS library returned {len(results)} results")
# Strategy 2: DDG HTML scraping
if not results:
logger.info("DDGS library failed or unavailable, trying HTML scraping...")
results = await _ddg_html_search(query, num_results)
if results:
logger.info(f"DDG HTML scraping returned {len(results)} results")
# Strategy 3: DDG Lite
if not results:
logger.info("HTML scraping failed, trying DDG Lite...")
results = await _ddg_lite_search(query, num_results)
if results:
logger.info(f"DDG Lite returned {len(results)} results")
if not SERPAPI_KEY:
logger.error("SERPAPI_KEY not configured!")
return f"[Web search failed: SerpAPI key not configured. Answer from your own knowledge.]"
# Strategy 4: Brave Search
if not results:
logger.info("DDG Lite failed, trying Brave Search...")
results = await _brave_html_search(query, num_results)
if results:
logger.info(f"Brave HTML search returned {len(results)} results")
results = await _serpapi_search(query, num_results)
# Strategy 5: Google scraping (last resort)
if not results:
logger.info("Brave failed, trying Google scraping...")
results = await _google_html_search(query, num_results)
if results:
logger.info(f"Google HTML search returned {len(results)} results")
logger.warning(f"SerpAPI returned no results for: '{query[:80]}'")
return f"[Web search for '{query}' returned no results. Answer from your own knowledge.]"
if not results:
logger.warning(f"ALL search strategies failed for query: '{query[:80]}'")
return f"[Web search for '{query}' returned no results. All search strategies exhausted. Answer from your own knowledge.]"
logger.info(f"SerpAPI returned {len(results)} results for: '{query[:60]}'")
# Format results
lines = [
"═" * 60,
"WEB SEARCH RESULTS",
"WEB SEARCH RESULTS (via Google/SerpAPI)",
f"Query: {query}",
f"Results: {len(results)}",
"═" * 60, "",
]
for i, r in enumerate(results, 1):
lines.append(f"[{i}] {r['title']}")
lines.append(f" URL: {r['url']}")
if r.get("url"):
lines.append(f" URL: {r['url']}")
if r.get("snippet"):
lines.append(f" {r['snippet']}")
if r.get("date"):
lines.append(f" Date: {r['date']}")
lines.append("")
# Fetch full page content for top results
if fetch_pages > 0:
detailed = await _fetch_pages(results[:fetch_pages])
# Only fetch actual URLs (skip knowledge graph entries without URLs)
fetchable = [r for r in results if r.get("url") and r["url"].startswith("http")]
detailed = await _fetch_pages(fetchable[:fetch_pages])
if detailed:
lines.append("═" * 60)
lines.append("DETAILED PAGE CONTENT:")
......@@ -174,408 +109,203 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->
# ═══════════════════════════════════════════════════
# Strategy 1: duckduckgo-search library
# SerpAPI Google Search
# ═══════════════════════════════════════════════════
def _ddgs_sync_search(query: str, n: int) -> list[dict]:
"""Synchronous DDGS search — runs in thread executor."""
# Queries DuckDuckGo through the `DDGS` client and collects hits as dicts with
# "title", "url" and "snippet" keys. Collection stops once `n` hits are kept.
# Returns whatever was collected (possibly an empty list) when no configuration
# produced usable results.
# NOTE(review): leading indentation is missing in this view, so the exact nesting
# of the statements below cannot be confirmed here — check against the real file.
results = []
# Try multiple configurations in order
configs = [
{"region": "us-en", "backend": "auto"},
{"region": "us-en", "backend": "html"},
{"region": "us-en", "backend": "lite"},
{"region": "wt-wt", "backend": "auto"},
]
for cfg in configs:
# Stop trying further configurations once any hits have been collected.
if results:
break
try:
with DDGS() as ddgs:
kwargs = {
"keywords": query,
"max_results": n + 5, # Request extra, filter later
"region": cfg["region"],
"safesearch": "moderate",
}
# backend parameter exists in newer versions
try:
kwargs["backend"] = cfg["backend"]
raw = list(ddgs.text(**kwargs))
except TypeError:
# Older version without backend parameter
# NOTE(review): any TypeError raised by the call triggers this retry,
# not only an unexpected-keyword error.
del kwargs["backend"]
raw = list(ddgs.text(**kwargs))
for r in raw:
# Each field is read under either of two key spellings, defaulting to "".
title = r.get("title", "").strip()
url = r.get("href", r.get("link", "")).strip()
snippet = r.get("body", r.get("snippet", "")).strip()
# Keep only hits that pass the shared URL/title filter.
if _is_valid_result(url, title):
results.append({"title": title, "url": url, "snippet": snippet})
if len(results) >= n:
break
if results:
logger.info(f"DDGS config {cfg} returned {len(results)} valid results")
return results
except Exception as e:
# A failing configuration is logged and skipped rather than aborting the search.
logger.warning(f"DDGS config {cfg} failed: {type(e).__name__}: {e}")
continue
return results
async def _ddgs_library_search(query: str, n: int) -> list[dict]:
"""Run the synchronous DDGS search in a thread pool."""
if not HAS_DDGS:
return []
async def _serpapi_search(query: str, n: int) -> list[dict]:
"""Hit SerpAPI's Google Search endpoint and parse results."""
try:
loop = asyncio.get_event_loop()
return await asyncio.wait_for(
loop.run_in_executor(_executor, _ddgs_sync_search, query, n),
timeout=30.0,
)
except asyncio.TimeoutError:
logger.error("DDGS library search timed out")
return []
except Exception as e:
logger.error(f"DDGS executor error: {e}")
return []
# ═══════════════════════════════════════════════════
# Strategy 2: DDG HTML scraping (improved)
# ═══════════════════════════════════════════════════
async def _ddg_html_search(query: str, n: int) -> list[dict]:
try:
headers = _get_headers()
headers["Referer"] = "https://duckduckgo.com/"
headers["Origin"] = "https://html.duckduckgo.com"
headers["Content-Type"] = "application/x-www-form-urlencoded"
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
http2=False,
) as client:
# First, hit the main page to get cookies
try:
await client.get("https://html.duckduckgo.com/html/", headers=_get_headers())
except Exception:
pass
await asyncio.sleep(random.uniform(0.3, 1.0))
resp = await client.post(
"https://html.duckduckgo.com/html/",
data={"q": query, "b": "", "kl": "us-en", "kp": "-1"},
headers=headers,
)
params = {
"q": query,
"api_key": SERPAPI_KEY,
"engine": "google",
"num": min(n + 5, 20),
"hl": "en",
"gl": "us",
"safe": "off",
"no_cache": "false",
}
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.get(SERPAPI_BASE, params=params)
if resp.status_code != 200:
logger.warning(f"DDG HTML returned status {resp.status_code}")
error_text = resp.text[:500]
logger.error(f"SerpAPI HTTP {resp.status_code}: {error_text}")
try:
err_data = resp.json()
logger.error(f"SerpAPI error: {err_data.get('error', 'unknown')}")
except Exception:
pass
return []
html = resp.text
data = resp.json()
if len(html) < 500:
logger.warning(f"DDG HTML response suspiciously short: {len(html)} chars")
return []
if "Please try again" in html or "bot" in html.lower()[:500]:
logger.warning("DDG HTML appears to have returned a bot-detection page")
return []
if "error" in data:
logger.error(f"SerpAPI error: {data['error']}")
return []
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results = []
# Try multiple selector strategies
selectors = [
(".result__body", ".result__title a, .result__a", ".result__snippet"),
(".result", ".result__a", ".result__snippet"),
(".web-result", ".result__a", ".result__snippet"),
(".results .result", "a.result__a", ".result__snippet"),
]
for container_sel, link_sel, snippet_sel in selectors:
elements = soup.select(container_sel)
if not elements:
continue
for el in elements:
title_a = el.select_one(link_sel)
snippet_el = el.select_one(snippet_sel)
if not title_a:
continue
title = title_a.get_text(strip=True)
href = title_a.get("href", "")
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
# DDG wraps URLs in a redirect
if "uddg=" in href:
parsed = parse_qs(urlparse(href).query)
href = parsed.get("uddg", [href])[0]
elif href.startswith("//duckduckgo.com/l/?"):
parsed = parse_qs(urlparse(href).query)
href = parsed.get("uddg", parsed.get("u", [href]))[0]
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
break
if results:
break
# Last resort: find ANY links that look like search results
if not results:
for a_tag in soup.find_all("a", href=True):
href = a_tag.get("href", "")
text = a_tag.get_text(strip=True)
if "uddg=" in href:
parsed = parse_qs(urlparse(href).query)
real_url = parsed.get("uddg", [""])[0]
if _is_valid_result(real_url, text):
results.append({"title": text, "url": real_url, "snippet": ""})
if len(results) >= n:
break
logger.info(f"DDG HTML parsed {len(results)} results from {len(html)} chars of HTML")
return results
except Exception as e:
logger.error(f"DDG HTML search error: {type(e).__name__}: {e}")
return []
# ═══════════════════════════════════════════════════
# Strategy 3: DDG Lite
# ═══════════════════════════════════════════════════
async def _ddg_lite_search(query: str, n: int) -> list[dict]:
try:
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
) as client:
resp = await client.post(
"https://lite.duckduckgo.com/lite/",
data={"q": query, "kl": "us-en"},
headers=_get_headers(),
# ── Answer box / Featured snippet (highest priority) ──
answer = data.get("answer_box", {})
if answer:
answer_text = (
answer.get("answer", "")
or answer.get("snippet", "")
or answer.get("result", "")
)
if resp.status_code != 200:
logger.warning(f"DDG Lite returned status {resp.status_code}")
return []
html = resp.text
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results = []
for a_tag in soup.select("a.result-link"):
href = a_tag.get("href", "")
title = a_tag.get_text(strip=True)
if not _is_valid_result(href, title):
continue
snippet = ""
parent_tr = a_tag.find_parent("tr")
if parent_tr:
next_tr = parent_tr.find_next_sibling("tr")
if next_tr:
snippet_td = next_tr.find("td", class_="result-snippet")
if snippet_td:
snippet = snippet_td.get_text(strip=True)[:300]
elif next_tr:
snippet = next_tr.get_text(strip=True)[:300]
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
if answer_text:
results.append({
"title": f"[Featured Answer] {answer.get('title', query)}",
"url": answer.get("link", ""),
"snippet": str(answer_text)[:500],
"date": "",
})
# ── Knowledge graph ──
kg = data.get("knowledge_graph", {})
if kg and kg.get("title"):
kg_parts = []
if kg.get("description"):
kg_parts.append(kg["description"])
if kg.get("type"):
kg_parts.append(f"Type: {kg['type']}")
for key in ["born", "died", "founded", "headquarters", "ceo",
"revenue", "employees", "website", "nationality",
"genre", "awards", "education", "height", "weight",
"capital", "population", "area", "currency",
"president", "prime_minister"]:
val = kg.get(key)
if val:
kg_parts.append(f"{key.replace('_', ' ').title()}: {val}")
# Also grab "known attributes" list if present
for attr in kg.get("attributes", {}).items():
if len(kg_parts) < 15:
kg_parts.append(f"{attr[0]}: {attr[1]}")
if kg_parts:
results.append({
"title": f"[Knowledge Graph] {kg['title']}",
"url": kg.get("website", kg.get("source", {}).get("link", "")),
"snippet": " | ".join(kg_parts),
"date": "",
})
# ── Organic results (main search results) ──
for item in data.get("organic_results", []):
title = item.get("title", "").strip()
url = item.get("link", "").strip()
snippet = item.get("snippet", "").strip()
date = item.get("date", "")
# Rich snippet extras
rich_snippet = item.get("rich_snippet", {})
if rich_snippet:
top = rich_snippet.get("top", {})
if top.get("detected_extensions", {}).get("rating"):
snippet += f" | Rating: {top['detected_extensions']['rating']}"
if _is_valid_result(url, title):
results.append({
"title": title,
"url": url,
"snippet": snippet,
"date": date,
})
if len(results) >= n + 3: # +3 for KG/answer/news
break
# Fallback: any td a[href^='http']
if not results:
for a_tag in soup.select("td a[href^='http']"):
href = a_tag.get("href", "")
title = a_tag.get_text(strip=True)
if _is_valid_result(href, title) and len(title) > 5:
results.append({"title": title, "url": href, "snippet": ""})
if len(results) >= n:
break
return results
except Exception as e:
logger.error(f"DDG Lite search error: {type(e).__name__}: {e}")
return []
# ═══════════════════════════════════════════════════
# Strategy 4: Brave Search HTML scraping
# ═══════════════════════════════════════════════════
async def _brave_html_search(query: str, n: int) -> list[dict]:
"""Scrape Brave Search as a fallback."""
try:
headers = _get_headers()
headers["Referer"] = "https://search.brave.com/"
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
) as client:
resp = await client.get(
"https://search.brave.com/search",
params={"q": query, "source": "web"},
headers=headers,
)
if resp.status_code != 200:
logger.warning(f"Brave Search returned status {resp.status_code}")
return []
html = resp.text
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results = []
for item in soup.select(".snippet, [data-type='web']"):
title_el = item.select_one(".snippet-title, .title, a[href^='http']")
desc_el = item.select_one(".snippet-description, .snippet-content, .description")
if not title_el:
continue
title = title_el.get_text(strip=True)
href = title_el.get("href", "")
if not href.startswith("http"):
link = item.select_one("a[href^='http']")
if link:
href = link.get("href", "")
snippet = desc_el.get_text(strip=True) if desc_el else ""
# ── Top stories (news) ──
for story in data.get("top_stories", [])[:3]:
title = story.get("title", "").strip()
url = story.get("link", "").strip()
source = story.get("source", "")
date = story.get("date", "")
if url and title:
results.append({
"title": f"[News] {title}",
"url": url,
"snippet": f"Source: {source}" + (f" | {date}" if date else ""),
"date": date,
})
# ── Related questions (People Also Ask) ──
paa = data.get("related_questions", [])
if paa:
paa_lines = []
for q in paa[:4]:
question = q.get("question", "")
paa_snippet = q.get("snippet", "")
if question:
entry = question
if paa_snippet:
entry += f" → {paa_snippet[:150]}"
paa_lines.append(entry)
if paa_lines:
results.append({
"title": "[People Also Ask]",
"url": "",
"snippet": " | ".join(paa_lines),
"date": "",
})
# ── Related searches ──
related = data.get("related_searches", [])
if related:
related_queries = [r.get("query", "") for r in related[:5] if r.get("query")]
if related_queries:
results.append({
"title": "[Related Searches]",
"url": "",
"snippet": " | ".join(related_queries),
"date": "",
})
logger.info(f"SerpAPI: {len(data.get('organic_results', []))} organic, "
f"KG={'yes' if kg.get('title') else 'no'}, "
f"answer={'yes' if answer else 'no'}, "
f"news={len(data.get('top_stories', []))}, "
f"total={len(results)}")
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
break
logger.info(f"Brave Search parsed {len(results)} results")
return results
except Exception as e:
logger.error(f"Brave HTML search error: {type(e).__name__}: {e}")
except httpx.TimeoutException:
logger.error("SerpAPI request timed out")
return []
# ═══════════════════════════════════════════════════
# Strategy 5: Google HTML scraping (last resort)
# ═══════════════════════════════════════════════════
async def _google_html_search(query: str, n: int) -> list[dict]:
"""Scrape Google search results as absolute last resort."""
try:
headers = _get_headers()
headers["Referer"] = "https://www.google.com/"
async with httpx.AsyncClient(
timeout=20.0,
follow_redirects=True,
) as client:
resp = await client.get(
"https://www.google.com/search",
params={"q": query, "hl": "en", "gl": "us", "num": str(n + 5)},
headers=headers,
)
if resp.status_code != 200:
logger.warning(f"Google returned status {resp.status_code}")
return []
html = resp.text
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
results = []
# Google search results are in divs with class 'g' or similar
for g_div in soup.select("div.g, div.tF2Cxc, div[data-sokoban-container]"):
# Find the link
link_el = g_div.select_one("a[href^='http']")
if not link_el:
continue
href = link_el.get("href", "")
# Clean Google redirect URLs
if href.startswith("/url?"):
parsed = parse_qs(urlparse(href).query)
href = parsed.get("q", parsed.get("url", [href]))[0]
# Find title
title_el = g_div.select_one("h3")
title = title_el.get_text(strip=True) if title_el else link_el.get_text(strip=True)
# Find snippet
snippet_el = g_div.select_one(".VwiC3b, .IsZvec, .s3v9rd, span.st")
snippet = snippet_el.get_text(strip=True) if snippet_el else ""
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": snippet})
if len(results) >= n:
break
# Fallback: just find any h3 > a patterns
if not results:
for h3 in soup.find_all("h3"):
parent_a = h3.find_parent("a")
if parent_a:
href = parent_a.get("href", "")
if href.startswith("/url?"):
parsed = parse_qs(urlparse(href).query)
href = parsed.get("q", parsed.get("url", [href]))[0]
title = h3.get_text(strip=True)
if _is_valid_result(href, title):
results.append({"title": title, "url": href, "snippet": ""})
if len(results) >= n:
break
logger.info(f"Google HTML parsed {len(results)} results")
return results
except Exception as e:
logger.error(f"Google HTML search error: {type(e).__name__}: {e}")
logger.error(f"SerpAPI search error: {type(e).__name__}: {e}")
return []
# ═══════════════════════════════════════════════════
# Page content fetcher
# Page Content Fetcher
# ═══════════════════════════════════════════════════
async def _fetch_pages(results: list[dict]) -> list[dict]:
"""Fetch and extract text content from result URLs."""
async def _fetch_one(r: dict) -> dict | None:
url = r.get("url", "")
if not url or not url.startswith("http"):
return None
try:
async with httpx.AsyncClient(
timeout=12.0,
follow_redirects=True,
http2=False,
) as client:
resp = await client.get(r["url"], headers=_get_headers())
headers = {
"User-Agent": _USER_AGENTS[0],
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
}
resp = await client.get(url, headers=headers)
if resp.status_code != 200:
return None
......@@ -589,14 +319,14 @@ async def _fetch_pages(results: list[dict]) -> list[dict]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
# Remove non-content elements
# Remove noise
for tag in soup(["script", "style", "nav", "footer", "header", "aside",
"form", "noscript", "svg", "iframe", "button", "input",
"select", "textarea", "menu", "[role='navigation']",
"[role='banner']", "[role='complementary']"]):
tag.decompose()
# Try to find main content area
# Find main content
main = soup.select_one(
"main, article, [role='main'], .post-content, .entry-content, "
".article-body, .article-content, #content, .content, "
......@@ -605,15 +335,15 @@ async def _fetch_pages(results: list[dict]) -> list[dict]:
text = (main or soup.body or soup).get_text(separator="\n", strip=True)
# Clean up — remove very short lines (navigation remnants)
# Clean up nav remnants
lines = [l.strip() for l in text.split("\n") if l.strip() and len(l.strip()) > 15]
text = "\n".join(lines)
if len(text) > 200:
return {"title": r["title"], "url": r["url"], "content": text}
return {"title": r["title"], "url": url, "content": text}
except Exception as e:
logger.debug(f"Failed to fetch {r.get('url', '?')}: {e}")
logger.debug(f"Failed to fetch {url}: {e}")
return None
......
......@@ -12,5 +12,4 @@ pydantic==2.10.4
Pillow==11.1.0
beautifulsoup4==4.12.3
python-pptx==1.0.2
python-docx==1.1.2
duckduckgo-search>=7.0.0
\ No newline at end of file
python-docx==1.1.2
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment