Update 2 files via Son of Anton

c4090250 · Administrator · 5f1d3ebc · c4090250 · c4090250
Commit c4090250 authored Mar 30, 2026 by Administrator
Show whitespace changes
Inline Side-by-side

Showing with 371 additions and 51 deletions

web_search_service.py backend/services/web_search_service.py +369 -50

requirements.txt requirements.txt +2 -1

No files found.
--- a/backend/services/web_search_service.py
+++ b/backend/services/web_search_service.py
 """
-Web Search Service — DuckDuckGo HTML, zero API keys required.
+Web Search Service — Multi-strategy with robust fallbacks.
+Primary: duckduckgo-search library (handles anti-bot internally)
+Fallback 1: DDG HTML scraping with anti-detection
+Fallback 2: DDG Lite scraping
+Fallback 3: Brave Search HTML scraping
 """
 import re
 import asyncio
+import random
+import logging
 from urllib.parse import quote_plus, urlparse, parse_qs
+from concurrent.futures import ThreadPoolExecutor
 import httpx
-_UA = (
+logger = logging.getLogger("son_of_anton.web_search")
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) "
-    "Chrome/124.0.0.0 Safari/537.36"
-)
-_HEADERS = {"User-Agent": _UA}
+# ═══════════════════════════════════════════════════
+#  duckduckgo-search library (primary)
+# ═══════════════════════════════════════════════════
+try:
+    from duckduckgo_search import DDGS
+    HAS_DDGS = True
+    logger.info("duckduckgo-search library available — using as primary search method")
+except ImportError:
+    HAS_DDGS = False
+    logger.warning("duckduckgo-search library NOT installed — falling back to HTML scraping")
+_executor = ThreadPoolExecutor(max_workers=2)
+_USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15",
+]
+def _get_headers():
+    return {
+        "User-Agent": random.choice(_USER_AGENTS),
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.9",
+        "Accept-Encoding": "gzip, deflate, br",
+        "DNT": "1",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Cache-Control": "max-age=0",
+    }
+# ═══════════════════════════════════════════════════
+#  Main entry point
+# ═══════════════════════════════════════════════════
 async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str:
+    """
+    Search the web using multiple strategies with automatic fallback.
+    Returns formatted search results as a string for the LLM context.
+    """
+    logger.info(f"Web search initiated: query='{query[:80]}', num_results={num_results}")
+    results = []
+    # Strategy 1: duckduckgo-search library (most reliable)
+    if HAS_DDGS:
+        results = await _ddgs_library_search(query, num_results)
+        if results:
+            logger.info(f"DDGS library returned {len(results)} results")
+    # Strategy 2: DDG HTML scraping
+    if not results:
+        logger.info("DDGS library failed or unavailable, trying HTML scraping...")
        results = await _ddg_html_search(query, num_results)
+        if results:
+            logger.info(f"DDG HTML scraping returned {len(results)} results")
+    # Strategy 3: DDG Lite
    if not results:
+        logger.info("HTML scraping failed, trying DDG Lite...")
        results = await _ddg_lite_search(query, num_results)
+        if results:
+            logger.info(f"DDG Lite returned {len(results)} results")
+    # Strategy 4: Brave Search HTML scraping (no API key required)
+    if not results:
+        logger.info("DDG Lite failed, trying Brave Search...")
+        results = await _brave_html_search(query, num_results)
+        if results:
+            logger.info(f"Brave HTML search returned {len(results)} results")
    if not results:
-        return f"[Web search for '{query}' returned no results. Answer from your own knowledge.]"
+        logger.warning(f"ALL search strategies failed for query: '{query[:80]}'")
+        return f"[Web search for '{query}' returned no results. All search strategies exhausted. Answer from your own knowledge.]"
+    # Format results
    lines = [
        "═" * 60,
        "WEB SEARCH RESULTS",
@@ -37,6 +116,7 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->
            lines.append(f"    {r['snippet']}")
        lines.append("")
+    # Fetch full page content for top results
    if fetch_pages > 0:
        detailed = await _fetch_pages(results[:fetch_pages])
        if detailed:
@@ -52,93 +132,332 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->
    return "\n".join(lines)
+# ═══════════════════════════════════════════════════
+#  Strategy 1: duckduckgo-search library
+# ═══════════════════════════════════════════════════
+def _ddgs_sync_search(query: str, n: int) -> list[dict]:
+    """Synchronous DDGS search — runs in thread executor."""
+    try:
+        with DDGS() as ddgs:
+            raw = list(ddgs.text(query, max_results=n, region="wt-wt", safesearch="off"))
+        results = []
+        for r in raw:
+            title = r.get("title", "").strip()
+            url = r.get("href", r.get("link", "")).strip()
+            snippet = r.get("body", r.get("snippet", "")).strip()
+            if title and url and url.startswith("http"):
+                results.append({"title": title, "url": url, "snippet": snippet})
+        return results
+    except Exception as e:
+        logger.error(f"DDGS library search error: {type(e).__name__}: {e}")
+        return []
+async def _ddgs_library_search(query: str, n: int) -> list[dict]:
+    """Run the synchronous DDGS search in a thread pool."""
+    if not HAS_DDGS:
+        return []
+    try:
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(_executor, _ddgs_sync_search, query, n)
+    except Exception as e:
+        logger.error(f"DDGS executor error: {e}")
+        return []
+# ═══════════════════════════════════════════════════
+#  Strategy 2: DDG HTML scraping (improved)
+# ═══════════════════════════════════════════════════
 async def _ddg_html_search(query: str, n: int) -> list[dict]:
    try:
-        url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
+        headers = _get_headers()
-        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
+        headers["Referer"] = "https://duckduckgo.com/"
+        headers["Origin"] = "https://html.duckduckgo.com"
+        headers["Content-Type"] = "application/x-www-form-urlencoded"
+        async with httpx.AsyncClient(
+            timeout=20.0,
+            follow_redirects=True,
+            http2=False,
+        ) as client:
+            # First, hit the main page to get cookies
+            try:
+                await client.get("https://html.duckduckgo.com/html/", headers=_get_headers())
+            except Exception:
+                pass
+            # Small random delay to appear more human
+            await asyncio.sleep(random.uniform(0.3, 1.0))
+            # Now do the actual search
            resp = await client.post(
                "https://html.duckduckgo.com/html/",
-                data={"q": query, "b": ""},
+                data={"q": query, "b": "", "kl": ""},
-                headers={**_HEADERS, "Content-Type": "application/x-www-form-urlencoded"},
+                headers=headers,
            )
            if resp.status_code != 200:
+                logger.warning(f"DDG HTML returned status {resp.status_code}")
+                return []
+            html = resp.text
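+            # Reject responses that look like a CAPTCHA, bot-detection, or empty page
+            # NOTE(review): the "bot" substring test below also matches words such as
+            # "robots" (e.g. a robots <meta> tag) — confirm it cannot reject valid pages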
+            if len(html) < 500:
+                logger.warning(f"DDG HTML response suspiciously short: {len(html)} chars")
+                return []
+            if "Please try again" in html or "bot" in html.lower()[:500]:
+                logger.warning("DDG HTML appears to have returned a bot-detection page")
                return []
        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(resp.text, "html.parser")
+        soup = BeautifulSoup(html, "html.parser")
        results = []
-        for el in soup.select(".result__body, .result"):
-            title_a = el.select_one(".result__title a, .result__a")
+        # Try multiple selector strategies
-            snippet_el = el.select_one(".result__snippet")
+        selectors = [
+            (".result__body", ".result__title a, .result__a", ".result__snippet"),
+            (".result", ".result__a", ".result__snippet"),
+            (".web-result", ".result__a", ".result__snippet"),
+            (".results .result", "a.result__a", ".result__snippet"),
+        ]
+        for container_sel, link_sel, snippet_sel in selectors:
+            elements = soup.select(container_sel)
+            if not elements:
+                continue
+            for el in elements:
+                title_a = el.select_one(link_sel)
+                snippet_el = el.select_one(snippet_sel)
                if not title_a:
                    continue
                title = title_a.get_text(strip=True)
                href = title_a.get("href", "")
                snippet = snippet_el.get_text(strip=True) if snippet_el else ""
+                # DDG wraps URLs in a redirect
                if "uddg=" in href:
                    parsed = parse_qs(urlparse(href).query)
                    href = parsed.get("uddg", [href])[0]
+                elif href.startswith("//duckduckgo.com/l/?"):
+                    parsed = parse_qs(urlparse(href).query)
+                    href = parsed.get("uddg", parsed.get("u", [href]))[0]
                if title and href and href.startswith("http"):
                    results.append({"title": title, "url": href, "snippet": snippet})
+                if len(results) >= n:
+                    break
+            if results:
+                break  # Stop trying selectors if we found results
+        if not results:
+            # Last resort: find ANY links that look like search results
+            for a_tag in soup.find_all("a", href=True):
+                href = a_tag.get("href", "")
+                text = a_tag.get_text(strip=True)
+                if "uddg=" in href:
+                    parsed = parse_qs(urlparse(href).query)
+                    real_url = parsed.get("uddg", [""])[0]
+                    if real_url and real_url.startswith("http") and text and len(text) > 5:
+                        results.append({"title": text, "url": real_url, "snippet": ""})
                if len(results) >= n:
                    break
+        logger.info(f"DDG HTML parsed {len(results)} results from {len(html)} chars of HTML")
        return results
    except Exception as e:
-        print(f"  DDG HTML search error: {e}")
+        logger.error(f"DDG HTML search error: {type(e).__name__}: {e}")
        return []
+# ═══════════════════════════════════════════════════
+#  Strategy 3: DDG Lite
+# ═══════════════════════════════════════════════════
 async def _ddg_lite_search(query: str, n: int) -> list[dict]:
    try:
-        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
+        async with httpx.AsyncClient(
-            resp = await client.get(
+            timeout=20.0,
+            follow_redirects=True,
+        ) as client:
+            resp = await client.post(
                "https://lite.duckduckgo.com/lite/",
-                params={"q": query},
+                data={"q": query},
-                headers=_HEADERS,
+                headers=_get_headers(),
            )
            if resp.status_code != 200:
+                logger.warning(f"DDG Lite returned status {resp.status_code}")
                return []
+            html = resp.text
        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(resp.text, "html.parser")
+        soup = BeautifulSoup(html, "html.parser")
        results = []
-        for a_tag in soup.select("a.result-link, td a[href^='http']"):
+        # DDG Lite uses a table-based layout
+        # Results are in <a class="result-link"> or just regular <a> tags in result rows
+        for a_tag in soup.select("a.result-link"):
            href = a_tag.get("href", "")
            title = a_tag.get_text(strip=True)
-            if href.startswith("http") and title and "duckduckgo" not in href:
+            if href.startswith("http") and title and "duckduckgo" not in href.lower():
-                snippet_td = a_tag.find_parent("tr")
+                # Try to find snippet in next table row
                snippet = ""
-                if snippet_td:
+                parent_tr = a_tag.find_parent("tr")
-                    next_tr = snippet_td.find_next_sibling("tr")
+                if parent_tr:
+                    next_tr = parent_tr.find_next_sibling("tr")
                    if next_tr:
+                        snippet_td = next_tr.find("td", class_="result-snippet")
+                        if snippet_td:
+                            snippet = snippet_td.get_text(strip=True)[:300]
+                        elif next_tr:
                            snippet = next_tr.get_text(strip=True)[:300]
                results.append({"title": title, "url": href, "snippet": snippet})
            if len(results) >= n:
                break
+        # Fallback: any td a[href^='http']
+        if not results:
+            for a_tag in soup.select("td a[href^='http']"):
+                href = a_tag.get("href", "")
+                title = a_tag.get_text(strip=True)
+                if title and "duckduckgo" not in href.lower() and len(title) > 5:
+                    results.append({"title": title, "url": href, "snippet": ""})
+                if len(results) >= n:
+                    break
        return results
-    except Exception:
+    except Exception as e:
+        logger.error(f"DDG Lite search error: {type(e).__name__}: {e}")
        return []
+# ═══════════════════════════════════════════════════
+#  Strategy 4: Brave Search HTML scraping
+# ═══════════════════════════════════════════════════
+async def _brave_html_search(query: str, n: int) -> list[dict]:
+    """Scrape Brave Search as a last resort."""
+    try:
+        headers = _get_headers()
+        headers["Referer"] = "https://search.brave.com/"
+        async with httpx.AsyncClient(
+            timeout=20.0,
+            follow_redirects=True,
+        ) as client:
+            resp = await client.get(
+                "https://search.brave.com/search",
+                params={"q": query, "source": "web"},
+                headers=headers,
+            )
+            if resp.status_code != 200:
+                logger.warning(f"Brave Search returned status {resp.status_code}")
+                return []
+            html = resp.text
+        from bs4 import BeautifulSoup
+        soup = BeautifulSoup(html, "html.parser")
+        results = []
+        # Brave uses various container classes
+        for item in soup.select(".snippet, [data-type='web']"):
+            title_el = item.select_one(".snippet-title, .title, a[href^='http']")
+            desc_el = item.select_one(".snippet-description, .snippet-content, .description")
+            if not title_el:
+                continue
+            title = title_el.get_text(strip=True)
+            href = title_el.get("href", "")
+            # Find the actual URL
+            if not href.startswith("http"):
+                link = item.select_one("a[href^='http']")
+                if link:
+                    href = link.get("href", "")
+            snippet = desc_el.get_text(strip=True) if desc_el else ""
+            if title and href and href.startswith("http") and "brave.com" not in href:
+                results.append({"title": title, "url": href, "snippet": snippet})
+            if len(results) >= n:
+                break
+        logger.info(f"Brave Search parsed {len(results)} results")
+        return results
+    except Exception as e:
+        logger.error(f"Brave HTML search error: {type(e).__name__}: {e}")
+        return []
+# ═══════════════════════════════════════════════════
+#  Page content fetcher
+# ═══════════════════════════════════════════════════
 async def _fetch_pages(results: list[dict]) -> list[dict]:
+    """Fetch and extract text content from result URLs."""
    async def _fetch_one(r: dict) -> dict | None:
        try:
-            async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
+            async with httpx.AsyncClient(
-                resp = await client.get(r["url"], headers=_HEADERS)
+                timeout=12.0,
-                if resp.status_code != 200 or "text/html" not in resp.headers.get("content-type", ""):
+                follow_redirects=True,
+                http2=False,
+            ) as client:
+                resp = await client.get(r["url"], headers=_get_headers())
+                if resp.status_code != 200:
+                    return None
+                content_type = resp.headers.get("content-type", "")
+                if "text/html" not in content_type:
                    return None
+                html = resp.text
            from bs4 import BeautifulSoup
-            soup = BeautifulSoup(resp.text, "html.parser")
+            soup = BeautifulSoup(html, "html.parser")
-            for tag in soup(["script", "style", "nav", "footer", "header", "aside", "form", "noscript", "svg", "iframe"]):
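+            # Remove non-content elements
+            # NOTE(review): soup([...]) matches tag names, so the "[role=...]" entries
+            # are not applied as CSS selectors — presumably soup.select() was intended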
+            for tag in soup(["script", "style", "nav", "footer", "header", "aside",
+                             "form", "noscript", "svg", "iframe", "button", "input",
+                             "select", "textarea", "menu", "[role='navigation']",
+                             "[role='banner']", "[role='complementary']"]):
                tag.decompose()
-            main = soup.select_one("main, article, [role='main'], .post-content, .entry-content, .article-body, #content")
+            # Try to find main content area
+            main = soup.select_one(
+                "main, article, [role='main'], .post-content, .entry-content, "
+                ".article-body, .article-content, #content, .content, "
+                ".post-body, .story-body, .page-content, #main-content"
+            )
            text = (main or soup.body or soup).get_text(separator="\n", strip=True)
-            lines = [l.strip() for l in text.split("\n") if l.strip() and len(l.strip()) > 20]
+            # Clean up — remove very short lines (navigation remnants)
+            lines = [l.strip() for l in text.split("\n") if l.strip() and len(l.strip()) > 15]
            text = "\n".join(lines)
            if len(text) > 200:
                return {"title": r["title"], "url": r["url"], "content": text}
-        except Exception:
-            pass
+        except Exception as e:
+            logger.debug(f"Failed to fetch {r.get('url', '?')}: {e}")
        return None
    tasks = [_fetch_one(r) for r in results]

--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ Pillow==11.1.0
 beautifulsoup4==4.12.3
 python-pptx==1.0.2
 python-docx==1.1.2
+duckduckgo-search>=7.0.0
\ No newline at end of file