Commit c4090250 authored by Administrator's avatar Administrator

Update 2 files via Son of Anton

parent 5f1d3ebc
""" """
Web Search Service — DuckDuckGo HTML, zero API keys required. Web Search Service — Multi-strategy with robust fallbacks.
Primary: duckduckgo-search library (handles anti-bot internally)
Fallback 1: DDG HTML scraping with anti-detection
Fallback 2: DDG Lite scraping
Fallback 3: Brave Search HTML scraping
""" """
import re import re
import asyncio import asyncio
import random
import logging
from urllib.parse import quote_plus, urlparse, parse_qs from urllib.parse import quote_plus, urlparse, parse_qs
from concurrent.futures import ThreadPoolExecutor
import httpx import httpx
_UA = ( logger = logging.getLogger("son_of_anton.web_search")
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/124.0.0.0 Safari/537.36"
)
_HEADERS = {"User-Agent": _UA}
# ═══════════════════════════════════════════════════
# duckduckgo-search library (primary)
# ═══════════════════════════════════════════════════
# Probe for the optional duckduckgo-search dependency once, at import time.
# HAS_DDGS records the outcome so the search pipeline can decide whether the
# library-backed strategy is usable or whether it must go straight to scraping.
try:
    from duckduckgo_search import DDGS
except ImportError:
    HAS_DDGS = False
    logger.warning("duckduckgo-search library NOT installed — falling back to HTML scraping")
else:
    HAS_DDGS = True
    logger.info("duckduckgo-search library available — using as primary search method")

# Small shared thread pool used to run the synchronous DDGS client off the
# event loop.
_executor = ThreadPoolExecutor(max_workers=2)
_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
]
def _get_headers():
return {
"User-Agent": random.choice(_USER_AGENTS),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Cache-Control": "max-age=0",
}
# ═══════════════════════════════════════════════════
# Main entry point
# ═══════════════════════════════════════════════════
async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str: async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str:
"""
Search the web using multiple strategies with automatic fallback.
Returns formatted search results as a string for the LLM context.
"""
logger.info(f"Web search initiated: query='{query[:80]}', num_results={num_results}")
results = []
# Strategy 1: duckduckgo-search library (most reliable)
if HAS_DDGS:
results = await _ddgs_library_search(query, num_results)
if results:
logger.info(f"DDGS library returned {len(results)} results")
# Strategy 2: DDG HTML scraping
if not results:
logger.info("DDGS library failed or unavailable, trying HTML scraping...")
results = await _ddg_html_search(query, num_results) results = await _ddg_html_search(query, num_results)
if results:
logger.info(f"DDG HTML scraping returned {len(results)} results")
# Strategy 3: DDG Lite
if not results: if not results:
logger.info("HTML scraping failed, trying DDG Lite...")
results = await _ddg_lite_search(query, num_results) results = await _ddg_lite_search(query, num_results)
if results:
logger.info(f"DDG Lite returned {len(results)} results")
# Strategy 4: Brave Search (no API key needed for basic)
if not results:
logger.info("DDG Lite failed, trying Brave Search...")
results = await _brave_html_search(query, num_results)
if results:
logger.info(f"Brave HTML search returned {len(results)} results")
if not results: if not results:
return f"[Web search for '{query}' returned no results. Answer from your own knowledge.]" logger.warning(f"ALL search strategies failed for query: '{query[:80]}'")
return f"[Web search for '{query}' returned no results. All search strategies exhausted. Answer from your own knowledge.]"
# Format results
lines = [ lines = [
"═" * 60, "═" * 60,
"WEB SEARCH RESULTS", "WEB SEARCH RESULTS",
...@@ -37,6 +116,7 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> ...@@ -37,6 +116,7 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->
lines.append(f" {r['snippet']}") lines.append(f" {r['snippet']}")
lines.append("") lines.append("")
# Fetch full page content for top results
if fetch_pages > 0: if fetch_pages > 0:
detailed = await _fetch_pages(results[:fetch_pages]) detailed = await _fetch_pages(results[:fetch_pages])
if detailed: if detailed:
...@@ -52,93 +132,332 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> ...@@ -52,93 +132,332 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->
return "\n".join(lines) return "\n".join(lines)
# ═══════════════════════════════════════════════════
# Strategy 1: duckduckgo-search library
# ═══════════════════════════════════════════════════
def _ddgs_sync_search(query: str, n: int) -> list[dict]:
    """Synchronous DDGS search — runs in thread executor.

    Args:
        query: Search terms passed to ``DDGS.text``.
        n: Value passed as ``max_results``.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. Entries without a
        title or without an ``http``-prefixed URL are dropped. Any exception
        (including ``DDGS`` being unavailable) is logged and yields ``[]``.
    """
    try:
        with DDGS() as ddgs:
            raw = list(ddgs.text(query, max_results=n, region="wt-wt", safesearch="off"))

        results = []
        for r in raw:
            # ``or ""`` guards against keys that are present but None: calling
            # .strip() on None would raise and, through the broad handler
            # below, throw away every result instead of just the bad one.
            # ``or`` chaining also lets an empty "href" fall through to "link".
            title = (r.get("title") or "").strip()
            url = (r.get("href") or r.get("link") or "").strip()
            snippet = (r.get("body") or r.get("snippet") or "").strip()
            if title and url.startswith("http"):
                results.append({"title": title, "url": url, "snippet": snippet})
        return results
    except Exception as e:
        logger.error(f"DDGS library search error: {type(e).__name__}: {e}")
        return []
async def _ddgs_library_search(query: str, n: int) -> list[dict]:
    """Run the synchronous DDGS search in a thread pool.

    Args:
        query: Search terms.
        n: Maximum number of results requested from the library.

    Returns:
        The list produced by ``_ddgs_sync_search``, or ``[]`` when the
        library is not installed or the executor call fails.
    """
    if not HAS_DDGS:
        return []
    try:
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the documented way to obtain it, whereas get_event_loop() carries
        # deprecated implicit-loop-creation behaviour.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(_executor, _ddgs_sync_search, query, n)
    except Exception as e:
        logger.error(f"DDGS executor error: {e}")
        return []
# ═══════════════════════════════════════════════════
# Strategy 2: DDG HTML scraping (improved)
# ═══════════════════════════════════════════════════
def _unwrap_ddg_redirect(href: str) -> str:
    """Return the destination URL carried inside a DuckDuckGo redirect link.

    Result anchors may point at ``//duckduckgo.com/l/?uddg=<encoded-url>``;
    the real target is in the ``uddg`` (or ``u``) query parameter. Links that
    are not redirects are returned unchanged.
    """
    if "uddg=" in href:
        return parse_qs(urlparse(href).query).get("uddg", [href])[0]
    if href.startswith("//duckduckgo.com/l/?"):
        return parse_qs(urlparse(href).query).get("u", [href])[0]
    return href


async def _ddg_html_search(query: str, n: int) -> list[dict]:
    """Scrape DuckDuckGo's HTML endpoint for search results.

    Performs a warm-up GET followed by a form POST on the same client, then
    parses the response with BeautifulSoup using several selector layouts and,
    as a last resort, any anchor carrying a ``uddg=`` redirect.

    Args:
        query: Search terms.
        n: Maximum number of results to return; ``n <= 0`` yields ``[]``.

    Returns:
        Up to ``n`` unique ``{"title", "url", "snippet"}`` dicts. Any failure
        (network error, non-200 status, suspiciously short page, parse error)
        is logged and yields ``[]``.
    """
    if n <= 0:
        return []
    try:
        # One header set (and therefore one User-Agent) is shared by the
        # warm-up GET and the POST: switching browsers mid-session while
        # re-using the same cookies is an easy bot signal.
        base_headers = _get_headers()
        post_headers = {
            **base_headers,
            "Referer": "https://duckduckgo.com/",
            "Origin": "https://html.duckduckgo.com",
            "Content-Type": "application/x-www-form-urlencoded",
        }

        async with httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
            http2=False,
        ) as client:
            # First, hit the main page to get cookies. This is best-effort:
            # a failure here must not prevent the real search.
            try:
                await client.get("https://html.duckduckgo.com/html/", headers=base_headers)
            except Exception as e:
                logger.debug(f"DDG HTML warm-up request failed: {type(e).__name__}: {e}")

            # Small random delay to appear more human
            await asyncio.sleep(random.uniform(0.3, 1.0))

            # Now do the actual search
            resp = await client.post(
                "https://html.duckduckgo.com/html/",
                data={"q": query, "b": "", "kl": ""},
                headers=post_headers,
            )

        if resp.status_code != 200:
            logger.warning(f"DDG HTML returned status {resp.status_code}")
            return []

        html = resp.text
        if len(html) < 500:
            logger.warning(f"DDG HTML response suspiciously short: {len(html)} chars")
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        results: list[dict] = []
        seen: set[str] = set()

        # Try multiple selector strategies: (container, link, snippet).
        selectors = [
            (".result__body", ".result__title a, .result__a", ".result__snippet"),
            (".result", ".result__a", ".result__snippet"),
            (".web-result", ".result__a", ".result__snippet"),
            (".results .result", "a.result__a", ".result__snippet"),
        ]
        for container_sel, link_sel, snippet_sel in selectors:
            for el in soup.select(container_sel):
                title_a = el.select_one(link_sel)
                if not title_a:
                    continue
                title = title_a.get_text(strip=True)
                # DDG wraps URLs in a redirect
                href = _unwrap_ddg_redirect(title_a.get("href", ""))
                if not title or not href.startswith("http") or href in seen:
                    continue
                snippet_el = el.select_one(snippet_sel)
                snippet = snippet_el.get_text(strip=True) if snippet_el else ""
                seen.add(href)
                results.append({"title": title, "url": href, "snippet": snippet})
                if len(results) >= n:
                    break
            if results:
                break  # Stop trying selectors if we found results

        if not results:
            # Last resort: find ANY links that look like search results.
            # The same target is usually linked several times per result
            # (title, URL line, snippet), hence the ``seen`` check.
            for a_tag in soup.find_all("a", href=True):
                href = a_tag.get("href", "")
                if "uddg=" not in href:
                    continue
                text = a_tag.get_text(strip=True)
                real_url = parse_qs(urlparse(href).query).get("uddg", [""])[0]
                if real_url.startswith("http") and len(text) > 5 and real_url not in seen:
                    seen.add(real_url)
                    results.append({"title": text, "url": real_url, "snippet": ""})
                    if len(results) >= n:
                        break

        # Bot-page heuristics are only consulted when nothing could be parsed:
        # a bare substring test such as "bot" also matches ordinary markup
        # (e.g. "robots"), so using it as a pre-filter discards good pages.
        if not results and ("Please try again" in html or "bot" in html[:500].lower()):
            logger.warning("DDG HTML appears to have returned a bot-detection page")

        logger.info(f"DDG HTML parsed {len(results)} results from {len(html)} chars of HTML")
        return results
    except Exception as e:
        logger.error(f"DDG HTML search error: {type(e).__name__}: {e}")
        return []
# ═══════════════════════════════════════════════════
# Strategy 3: DDG Lite
# ═══════════════════════════════════════════════════
async def _ddg_lite_search(query: str, n: int) -> list[dict]:
    """Scrape DuckDuckGo Lite (table-based layout) for search results.

    Args:
        query: Search terms, sent as form field ``q``.
        n: Maximum number of results to return; ``n <= 0`` yields ``[]``.

    Returns:
        Up to ``n`` unique ``{"title", "url", "snippet"}`` dicts. Snippets
        are truncated to 300 characters. Any failure is logged and yields
        ``[]``.
    """
    if n <= 0:
        return []

    def _target(href: str) -> str:
        # Unwrap "/l/?uddg=<encoded-url>" redirect links; others pass through.
        if "uddg=" in href:
            return parse_qs(urlparse(href).query).get("uddg", [href])[0]
        return href

    def _is_external(url: str) -> bool:
        # Compare the hostname rather than searching the whole URL, so that
        # third-party pages merely mentioning "duckduckgo" are kept.
        if not url.startswith("http"):
            return False
        try:
            host = (urlparse(url).hostname or "").lower()
        except ValueError:
            return False
        return not (host == "duckduckgo.com" or host.endswith(".duckduckgo.com"))

    try:
        async with httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
        ) as client:
            resp = await client.post(
                "https://lite.duckduckgo.com/lite/",
                data={"q": query},
                headers=_get_headers(),
            )

        if resp.status_code != 200:
            logger.warning(f"DDG Lite returned status {resp.status_code}")
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict] = []
        seen: set[str] = set()

        # DDG Lite uses a table-based layout: the link sits in one row and
        # its snippet, when present, in the following row.
        for a_tag in soup.select("a.result-link"):
            href = _target(a_tag.get("href", ""))
            title = a_tag.get_text(strip=True)
            if not title or href in seen or not _is_external(href):
                continue

            snippet = ""
            parent_tr = a_tag.find_parent("tr")
            next_tr = parent_tr.find_next_sibling("tr") if parent_tr is not None else None
            if next_tr is not None:
                snippet_td = next_tr.find("td", class_="result-snippet")
                source = snippet_td if snippet_td is not None else next_tr
                snippet = source.get_text(strip=True)[:300]

            seen.add(href)
            results.append({"title": title, "url": href, "snippet": snippet})
            if len(results) >= n:
                break

        # Fallback: any link inside a table cell that resolves to an
        # external http(s) URL.
        if not results:
            for a_tag in soup.select("td a[href]"):
                href = _target(a_tag.get("href", ""))
                title = a_tag.get_text(strip=True)
                if len(title) > 5 and href not in seen and _is_external(href):
                    seen.add(href)
                    results.append({"title": title, "url": href, "snippet": ""})
                    if len(results) >= n:
                        break

        return results
    except Exception as e:
        logger.error(f"DDG Lite search error: {type(e).__name__}: {e}")
        return []
# ═══════════════════════════════════════════════════
# Strategy 4: Brave Search HTML scraping
# ═══════════════════════════════════════════════════
async def _brave_html_search(query: str, n: int) -> list[dict]:
    """Scrape Brave Search as a last resort.

    Args:
        query: Search terms, sent as query parameter ``q``.
        n: Maximum number of results to return; ``n <= 0`` yields ``[]``.

    Returns:
        Up to ``n`` unique ``{"title", "url", "snippet"}`` dicts whose URL is
        not hosted on brave.com. Any failure is logged and yields ``[]``.
    """
    if n <= 0:
        return []
    try:
        headers = _get_headers()
        headers["Referer"] = "https://search.brave.com/"

        async with httpx.AsyncClient(
            timeout=20.0,
            follow_redirects=True,
        ) as client:
            resp = await client.get(
                "https://search.brave.com/search",
                params={"q": query, "source": "web"},
                headers=headers,
            )

        if resp.status_code != 200:
            logger.warning(f"Brave Search returned status {resp.status_code}")
            return []

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict] = []
        # The two container selectors can match nested elements of the same
        # result, so URLs are de-duplicated.
        seen: set[str] = set()

        for item in soup.select(".snippet, [data-type='web']"):
            title_el = item.select_one(".snippet-title, .title, a[href^='http']")
            if not title_el:
                continue
            title = title_el.get_text(strip=True)
            href = title_el.get("href", "")

            # The title element may not be the anchor itself; fall back to
            # the first absolute link inside the container.
            if not href.startswith("http"):
                link = item.select_one("a[href^='http']")
                if link:
                    href = link.get("href", "")

            if not title or not href.startswith("http") or href in seen:
                continue

            # Filter Brave's own pages by hostname; a substring test on the
            # whole URL would also reject unrelated sites whose address
            # happens to contain "brave.com".
            try:
                host = (urlparse(href).hostname or "").lower()
            except ValueError:
                continue
            if host == "brave.com" or host.endswith(".brave.com"):
                continue

            desc_el = item.select_one(".snippet-description, .snippet-content, .description")
            snippet = desc_el.get_text(strip=True) if desc_el else ""

            seen.add(href)
            results.append({"title": title, "url": href, "snippet": snippet})
            if len(results) >= n:
                break

        logger.info(f"Brave Search parsed {len(results)} results")
        return results
    except Exception as e:
        logger.error(f"Brave HTML search error: {type(e).__name__}: {e}")
        return []
# ═══════════════════════════════════════════════════
# Page content fetcher
# ═══════════════════════════════════════════════════
async def _fetch_pages(results: list[dict]) -> list[dict]: async def _fetch_pages(results: list[dict]) -> list[dict]:
"""Fetch and extract text content from result URLs."""
async def _fetch_one(r: dict) -> dict | None:
    """Download one result page and reduce it to readable text.

    Args:
        r: Result dict; ``r["url"]`` is fetched and ``r["title"]`` is copied
            into the returned dict.

    Returns:
        ``{"title", "url", "content"}`` when the page is HTML, answers with
        status 200 and yields more than 200 characters of text; otherwise
        ``None``. Exceptions are logged at debug level and yield ``None``.
    """
    try:
        async with httpx.AsyncClient(
            timeout=12.0,
            follow_redirects=True,
            http2=False,
        ) as client:
            resp = await client.get(r["url"], headers=_get_headers())

        if resp.status_code != 200:
            return None
        content_type = resp.headers.get("content-type", "")
        if "text/html" not in content_type.lower():
            return None

        from bs4 import BeautifulSoup
        soup = BeautifulSoup(resp.text, "html.parser")

        # Remove non-content elements. Calling the soup with a list matches
        # TAG NAMES only, so the role-based chrome is removed separately
        # through a CSS selector below.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside",
                         "form", "noscript", "svg", "iframe", "button", "input",
                         "select", "textarea", "menu"]):
            if not getattr(tag, "decomposed", False):
                tag.decompose()
        for tag in soup.select("[role='navigation'], [role='banner'], [role='complementary']"):
            # A match nested inside an earlier match is already destroyed.
            if not getattr(tag, "decomposed", False):
                tag.decompose()

        # Try to find main content area
        main = soup.select_one(
            "main, article, [role='main'], .post-content, .entry-content, "
            ".article-body, .article-content, #content, .content, "
            ".post-body, .story-body, .page-content, #main-content"
        )
        text = (main or soup.body or soup).get_text(separator="\n", strip=True)

        # Clean up — remove very short lines (navigation remnants)
        stripped_lines = (line.strip() for line in text.split("\n"))
        text = "\n".join(line for line in stripped_lines if len(line) > 15)

        if len(text) > 200:
            return {"title": r["title"], "url": r["url"], "content": text}
    except Exception as e:
        logger.debug(f"Failed to fetch {r.get('url', '?')}: {e}")
    return None
tasks = [_fetch_one(r) for r in results] tasks = [_fetch_one(r) for r in results]
......
...@@ -13,3 +13,4 @@ Pillow==11.1.0 ...@@ -13,3 +13,4 @@ Pillow==11.1.0
beautifulsoup4==4.12.3 beautifulsoup4==4.12.3
python-pptx==1.0.2 python-pptx==1.0.2
python-docx==1.1.2 python-docx==1.1.2
duckduckgo-search>=7.0.0
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment