Update 4 files via Son of Anton

57750e01 · Administrator · 6774dcff · 57750e01 · 57750e01 · 57750e01
Commit 57750e01 authored Mar 30, 2026 by Administrator
Hide whitespace changes
Inline Side-by-side

Showing with 208 additions and 473 deletions

config.py backend/config.py +7 -1

web_search_service.py backend/services/web_search_service.py +200 -470

requirements.txt requirements.txt +1 -2

No files found.
--- a/backend/config.py
+++ b/backend/config.py
@@ -37,4 +37,10 @@ BEDROCK_ENDPOINT: str = (
    f"https://bedrock-runtime.{AWS_REGION}.amazonaws.com"
 )

-APP_VERSION: str = "4.0.0"
\ No newline at end of file
+# SerpAPI for web search.
+# The key is a secret: it comes from the environment only, never from a
+# committed default. When unset it is empty, and search_web() then reports
+# the service as unconfigured instead of calling SerpAPI.
+SERPAPI_KEY: str = os.getenv("SERPAPI_KEY", "")
+
+APP_VERSION: str = "4.2.0"
\ No newline at end of file
--- a/backend/services/web_search_service.py
+++ b/backend/services/web_search_service.py
 """
-Web Search Service — v4.1.1 — Robust multi-strategy with proper region handling.
-Primary: duckduckgo-search library (forced US-English)
-Fallback 1: DDG HTML scraping with anti-detection
-Fallback 2: DDG Lite scraping
-Fallback 3: Brave Search HTML scraping
+Web Search Service — v4.2.0 — SerpAPI (Google Search)
+Clean and reliable: no HTML-scraping fallbacks.
 """

-import re
 import asyncio
-import random
 import logging
-from urllib.parse import quote_plus, urlparse, parse_qs
-from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlencode

 import httpx

+from backend.config import SERPAPI_KEY
+
 logger = logging.getLogger("son_of_anton.web_search")

 # ═══════════════════════════════════════════════════
-#  duckduckgo-search library (primary)
+#  SerpAPI Configuration
 # ═══════════════════════════════════════════════════

-try:
-    from duckduckgo_search import DDGS
-    HAS_DDGS = True
-    logger.info("duckduckgo-search library available — using as primary search method")
-except ImportError:
-    HAS_DDGS = False
-    logger.warning("duckduckgo-search library NOT installed — falling back to HTML scraping")
-
-
-_executor = ThreadPoolExecutor(max_workers=2)
+SERPAPI_BASE = "https://serpapi.com/search"

 _USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
-    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
 ]

-# Domains that are NEVER useful in English search results
+# Domains we never want cluttering results
 BLOCKED_DOMAINS = {
    "baidu.com", "zhihu.com", "csdn.net", "bilibili.com", "weibo.com",
    "sogou.com", "163.com", "qq.com", "taobao.com", "jd.com",
    "douyin.com", "tiktok.com", "yandex.ru", "mail.ru",
-    "duckduckgo.com", "search.brave.com", "google.com/search",
 }


-def _get_headers():
-    return {
-        "User-Agent": random.choice(_USER_AGENTS),
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
-        "Accept-Language": "en-US,en;q=0.9",
-        "Accept-Encoding": "gzip, deflate, br",
-        "DNT": "1",
-        "Connection": "keep-alive",
-        "Upgrade-Insecure-Requests": "1",
-        "Sec-Fetch-Dest": "document",
-        "Sec-Fetch-Mode": "navigate",
-        "Sec-Fetch-Site": "none",
-        "Sec-Fetch-User": "?1",
-        "Cache-Control": "max-age=0",
-    }
-
-
 def _is_valid_result(url: str, title: str) -> bool:
-    """Filter out garbage results — non-English, ad domains, empty junk."""
+    """Decide whether a search hit is worth keeping.
+
+    Args:
+        url: Result URL; must be non-empty and start with "http".
+        title: Result title; must contain at least 3 non-blank characters.
+
+    Returns:
+        False for a missing/non-http URL, a too-short title, a URL that
+        cannot be parsed, or a host on (or a subdomain of) BLOCKED_DOMAINS;
+        True otherwise.
+    """
    if not url or not url.startswith("http"):
        return False
    if not title or len(title.strip()) < 3:
        return False
-
+    from urllib.parse import urlparse
    try:
-        parsed = urlparse(url)
-        domain = parsed.netloc.lower().lstrip("www.")
+        # removeprefix, not lstrip: lstrip("www.") strips any leading run of
+        # the characters 'w' and '.', turning "weibo.com" into "eibo.com" and
+        # letting a blocked host slip past the comparison below.
+        domain = urlparse(url).netloc.lower().removeprefix("www.")
-    except Exception:
+    except ValueError:
+        # urlparse raises ValueError on a malformed host (e.g. an unbalanced
+        # '[' in an IPv6 literal); reject the result instead of passing it.
        return False
-
-    # Block known garbage domains
    for blocked in BLOCKED_DOMAINS:
        if domain == blocked or domain.endswith("." + blocked):
            return False
-
-    # Block results that are clearly search engine pages, not actual results
-    if "/search?" in url or "/search/" in url:
-        return False
-
    return True


 # ═══════════════════════════════════════════════════
-#  Main entry point
+#  Main Entry Point
 # ═══════════════════════════════════════════════════

 async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) -> str:
    """
-    Search the web using multiple strategies with automatic fallback.
-    Returns formatted search results as a string for the LLM context.
+    Search the web via SerpAPI (Google).
+    Returns formatted search results string for LLM context.
    """
    logger.info(f"Web search initiated: query='{query[:80]}', num_results={num_results}")

-    results = []
-
-    # Strategy 1: duckduckgo-search library (most reliable)
-    if HAS_DDGS:
-        results = await _ddgs_library_search(query, num_results)
-        if results:
-            logger.info(f"DDGS library returned {len(results)} results")
-
-    # Strategy 2: DDG HTML scraping
-    if not results:
-        logger.info("DDGS library failed or unavailable, trying HTML scraping...")
-        results = await _ddg_html_search(query, num_results)
-        if results:
-            logger.info(f"DDG HTML scraping returned {len(results)} results")
-
-    # Strategy 3: DDG Lite
-    if not results:
-        logger.info("HTML scraping failed, trying DDG Lite...")
-        results = await _ddg_lite_search(query, num_results)
-        if results:
-            logger.info(f"DDG Lite returned {len(results)} results")
+    if not SERPAPI_KEY:
+        logger.error("SERPAPI_KEY not configured!")
+        return f"[Web search failed: SerpAPI key not configured. Answer from your own knowledge.]"

-    # Strategy 4: Brave Search
-    if not results:
-        logger.info("DDG Lite failed, trying Brave Search...")
-        results = await _brave_html_search(query, num_results)
-        if results:
-            logger.info(f"Brave HTML search returned {len(results)} results")
+    results = await _serpapi_search(query, num_results)

-    # Strategy 5: Google scraping (last resort)
    if not results:
-        logger.info("Brave failed, trying Google scraping...")
-        results = await _google_html_search(query, num_results)
-        if results:
-            logger.info(f"Google HTML search returned {len(results)} results")
+        logger.warning(f"SerpAPI returned no results for: '{query[:80]}'")
+        return f"[Web search for '{query}' returned no results. Answer from your own knowledge.]"

-    if not results:
-        logger.warning(f"ALL search strategies failed for query: '{query[:80]}'")
-        return f"[Web search for '{query}' returned no results. All search strategies exhausted. Answer from your own knowledge.]"
+    logger.info(f"SerpAPI returned {len(results)} results for: '{query[:60]}'")

    # Format results
    lines = [
        "═" * 60,
-        "WEB SEARCH RESULTS",
+        "WEB SEARCH RESULTS (via Google/SerpAPI)",
        f"Query: {query}",
        f"Results: {len(results)}",
        "═" * 60, "",
    ]
    for i, r in enumerate(results, 1):
        lines.append(f"[{i}] {r['title']}")
-        lines.append(f"    URL: {r['url']}")
+        if r.get("url"):
+            lines.append(f"    URL: {r['url']}")
        if r.get("snippet"):
            lines.append(f"    {r['snippet']}")
+        if r.get("date"):
+            lines.append(f"    Date: {r['date']}")
        lines.append("")

    # Fetch full page content for top results
    if fetch_pages > 0:
-        detailed = await _fetch_pages(results[:fetch_pages])
+        # Only fetch actual URLs (skip knowledge graph entries without URLs)
+        fetchable = [r for r in results if r.get("url") and r["url"].startswith("http")]
+        detailed = await _fetch_pages(fetchable[:fetch_pages])
        if detailed:
            lines.append("═" * 60)
            lines.append("DETAILED PAGE CONTENT:")
@@ -174,408 +109,203 @@ async def search_web(query: str, num_results: int = 8, fetch_pages: int = 3) ->


 # ═══════════════════════════════════════════════════
-#  Strategy 1: duckduckgo-search library
+#  SerpAPI Google Search
 # ═══════════════════════════════════════════════════

-def _ddgs_sync_search(query: str, n: int) -> list[dict]:
-    """Synchronous DDGS search — runs in thread executor."""
-    results = []
-
-    # Try multiple configurations in order
-    configs = [
-        {"region": "us-en", "backend": "auto"},
-        {"region": "us-en", "backend": "html"},
-        {"region": "us-en", "backend": "lite"},
-        {"region": "wt-wt", "backend": "auto"},
-    ]
-
-    for cfg in configs:
-        if results:
-            break
-        try:
-            with DDGS() as ddgs:
-                kwargs = {
-                    "keywords": query,
-                    "max_results": n + 5,  # Request extra, filter later
-                    "region": cfg["region"],
-                    "safesearch": "moderate",
-                }
-                # backend parameter exists in newer versions
-                try:
-                    kwargs["backend"] = cfg["backend"]
-                    raw = list(ddgs.text(**kwargs))
-                except TypeError:
-                    # Older version without backend parameter
-                    del kwargs["backend"]
-                    raw = list(ddgs.text(**kwargs))
-
-            for r in raw:
-                title = r.get("title", "").strip()
-                url = r.get("href", r.get("link", "")).strip()
-                snippet = r.get("body", r.get("snippet", "")).strip()
-
-                if _is_valid_result(url, title):
-                    results.append({"title": title, "url": url, "snippet": snippet})
-
-                if len(results) >= n:
-                    break
-
-            if results:
-                logger.info(f"DDGS config {cfg} returned {len(results)} valid results")
-                return results
-
-        except Exception as e:
-            logger.warning(f"DDGS config {cfg} failed: {type(e).__name__}: {e}")
-            continue
-
-    return results
-
-
-async def _ddgs_library_search(query: str, n: int) -> list[dict]:
-    """Run the synchronous DDGS search in a thread pool."""
-    if not HAS_DDGS:
-        return []
+async def _serpapi_search(query: str, n: int) -> list[dict]:
+    """Hit SerpAPI's Google Search endpoint and parse results."""
    try:
-        loop = asyncio.get_event_loop()
-        return await asyncio.wait_for(
-            loop.run_in_executor(_executor, _ddgs_sync_search, query, n),
-            timeout=30.0,
-        )
-    except asyncio.TimeoutError:
-        logger.error("DDGS library search timed out")
-        return []
-    except Exception as e:
-        logger.error(f"DDGS executor error: {e}")
-        return []
-
-
-# ═══════════════════════════════════════════════════
-#  Strategy 2: DDG HTML scraping (improved)
-# ═══════════════════════════════════════════════════
-
-async def _ddg_html_search(query: str, n: int) -> list[dict]:
-    try:
-        headers = _get_headers()
-        headers["Referer"] = "https://duckduckgo.com/"
-        headers["Origin"] = "https://html.duckduckgo.com"
-        headers["Content-Type"] = "application/x-www-form-urlencoded"
-
-        async with httpx.AsyncClient(
-            timeout=20.0,
-            follow_redirects=True,
-            http2=False,
-        ) as client:
-            # First, hit the main page to get cookies
-            try:
-                await client.get("https://html.duckduckgo.com/html/", headers=_get_headers())
-            except Exception:
-                pass
-
-            await asyncio.sleep(random.uniform(0.3, 1.0))
-
-            resp = await client.post(
-                "https://html.duckduckgo.com/html/",
-                data={"q": query, "b": "", "kl": "us-en", "kp": "-1"},
-                headers=headers,
-            )
+        params = {
+            "q": query,
+            "api_key": SERPAPI_KEY,
+            "engine": "google",
+            "num": min(n + 5, 20),
+            "hl": "en",
+            "gl": "us",
+            "safe": "off",
+            "no_cache": "false",
+        }
+
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            resp = await client.get(SERPAPI_BASE, params=params)

            if resp.status_code != 200:
-                logger.warning(f"DDG HTML returned status {resp.status_code}")
+                error_text = resp.text[:500]
+                logger.error(f"SerpAPI HTTP {resp.status_code}: {error_text}")
+                try:
+                    err_data = resp.json()
+                    logger.error(f"SerpAPI error: {err_data.get('error', 'unknown')}")
+                except Exception:
+                    pass
                return []

-            html = resp.text
+            data = resp.json()

-            if len(html) < 500:
-                logger.warning(f"DDG HTML response suspiciously short: {len(html)} chars")
-                return []
-
-            if "Please try again" in html or "bot" in html.lower()[:500]:
-                logger.warning("DDG HTML appears to have returned a bot-detection page")
-                return []
+        if "error" in data:
+            logger.error(f"SerpAPI error: {data['error']}")
+            return []

-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html, "html.parser")
        results = []

-        # Try multiple selector strategies
-        selectors = [
-            (".result__body", ".result__title a, .result__a", ".result__snippet"),
-            (".result", ".result__a", ".result__snippet"),
-            (".web-result", ".result__a", ".result__snippet"),
-            (".results .result", "a.result__a", ".result__snippet"),
-        ]
-
-        for container_sel, link_sel, snippet_sel in selectors:
-            elements = soup.select(container_sel)
-            if not elements:
-                continue
-
-            for el in elements:
-                title_a = el.select_one(link_sel)
-                snippet_el = el.select_one(snippet_sel)
-                if not title_a:
-                    continue
-
-                title = title_a.get_text(strip=True)
-                href = title_a.get("href", "")
-                snippet = snippet_el.get_text(strip=True) if snippet_el else ""
-
-                # DDG wraps URLs in a redirect
-                if "uddg=" in href:
-                    parsed = parse_qs(urlparse(href).query)
-                    href = parsed.get("uddg", [href])[0]
-                elif href.startswith("//duckduckgo.com/l/?"):
-                    parsed = parse_qs(urlparse(href).query)
-                    href = parsed.get("uddg", parsed.get("u", [href]))[0]
-
-                if _is_valid_result(href, title):
-                    results.append({"title": title, "url": href, "snippet": snippet})
-
-                if len(results) >= n:
-                    break
-
-            if results:
-                break
-
-        # Last resort: find ANY links that look like search results
-        if not results:
-            for a_tag in soup.find_all("a", href=True):
-                href = a_tag.get("href", "")
-                text = a_tag.get_text(strip=True)
-                if "uddg=" in href:
-                    parsed = parse_qs(urlparse(href).query)
-                    real_url = parsed.get("uddg", [""])[0]
-                    if _is_valid_result(real_url, text):
-                        results.append({"title": text, "url": real_url, "snippet": ""})
-                if len(results) >= n:
-                    break
-
-        logger.info(f"DDG HTML parsed {len(results)} results from {len(html)} chars of HTML")
-        return results
-
-    except Exception as e:
-        logger.error(f"DDG HTML search error: {type(e).__name__}: {e}")
-        return []
-
-
-# ═══════════════════════════════════════════════════
-#  Strategy 3: DDG Lite
-# ═══════════════════════════════════════════════════
-
-async def _ddg_lite_search(query: str, n: int) -> list[dict]:
-    try:
-        async with httpx.AsyncClient(
-            timeout=20.0,
-            follow_redirects=True,
-        ) as client:
-            resp = await client.post(
-                "https://lite.duckduckgo.com/lite/",
-                data={"q": query, "kl": "us-en"},
-                headers=_get_headers(),
+        # ── Answer box / Featured snippet (highest priority) ──
+        answer = data.get("answer_box", {})
+        if answer:
+            answer_text = (
+                answer.get("answer", "")
+                or answer.get("snippet", "")
+                or answer.get("result", "")
            )
-            if resp.status_code != 200:
-                logger.warning(f"DDG Lite returned status {resp.status_code}")
-                return []
-
-            html = resp.text
-
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html, "html.parser")
-        results = []
-
-        for a_tag in soup.select("a.result-link"):
-            href = a_tag.get("href", "")
-            title = a_tag.get_text(strip=True)
-            if not _is_valid_result(href, title):
-                continue
-
-            snippet = ""
-            parent_tr = a_tag.find_parent("tr")
-            if parent_tr:
-                next_tr = parent_tr.find_next_sibling("tr")
-                if next_tr:
-                    snippet_td = next_tr.find("td", class_="result-snippet")
-                    if snippet_td:
-                        snippet = snippet_td.get_text(strip=True)[:300]
-                    elif next_tr:
-                        snippet = next_tr.get_text(strip=True)[:300]
-
-            results.append({"title": title, "url": href, "snippet": snippet})
-            if len(results) >= n:
+            if answer_text:
+                results.append({
+                    "title": f"[Featured Answer] {answer.get('title', query)}",
+                    "url": answer.get("link", ""),
+                    "snippet": str(answer_text)[:500],
+                    "date": "",
+                })
+
+        # ── Knowledge graph ──
+        kg = data.get("knowledge_graph", {})
+        if kg and kg.get("title"):
+            kg_parts = []
+            if kg.get("description"):
+                kg_parts.append(kg["description"])
+            if kg.get("type"):
+                kg_parts.append(f"Type: {kg['type']}")
+
+            for key in ["born", "died", "founded", "headquarters", "ceo",
+                        "revenue", "employees", "website", "nationality",
+                        "genre", "awards", "education", "height", "weight",
+                        "capital", "population", "area", "currency",
+                        "president", "prime_minister"]:
+                val = kg.get(key)
+                if val:
+                    kg_parts.append(f"{key.replace('_', ' ').title()}: {val}")
+
+            # Also grab "known attributes" list if present
+            for attr in kg.get("attributes", {}).items():
+                if len(kg_parts) < 15:
+                    kg_parts.append(f"{attr[0]}: {attr[1]}")
+
+            if kg_parts:
+                results.append({
+                    "title": f"[Knowledge Graph] {kg['title']}",
+                    "url": kg.get("website", kg.get("source", {}).get("link", "")),
+                    "snippet": " | ".join(kg_parts),
+                    "date": "",
+                })
+
+        # ── Organic results (main search results) ──
+        for item in data.get("organic_results", []):
+            title = item.get("title", "").strip()
+            url = item.get("link", "").strip()
+            snippet = item.get("snippet", "").strip()
+            date = item.get("date", "")
+
+            # Rich snippet extras
+            rich_snippet = item.get("rich_snippet", {})
+            if rich_snippet:
+                top = rich_snippet.get("top", {})
+                if top.get("detected_extensions", {}).get("rating"):
+                    snippet += f" | Rating: {top['detected_extensions']['rating']}"
+
+            if _is_valid_result(url, title):
+                results.append({
+                    "title": title,
+                    "url": url,
+                    "snippet": snippet,
+                    "date": date,
+                })
+
+            if len(results) >= n + 3:  # +3 for KG/answer/news
                break

-        # Fallback: any td a[href^='http']
-        if not results:
-            for a_tag in soup.select("td a[href^='http']"):
-                href = a_tag.get("href", "")
-                title = a_tag.get_text(strip=True)
-                if _is_valid_result(href, title) and len(title) > 5:
-                    results.append({"title": title, "url": href, "snippet": ""})
-                if len(results) >= n:
-                    break
-
-        return results
-
-    except Exception as e:
-        logger.error(f"DDG Lite search error: {type(e).__name__}: {e}")
-        return []
-
-
-# ═══════════════════════════════════════════════════
-#  Strategy 4: Brave Search HTML scraping
-# ═══════════════════════════════════════════════════
-
-async def _brave_html_search(query: str, n: int) -> list[dict]:
-    """Scrape Brave Search as a fallback."""
-    try:
-        headers = _get_headers()
-        headers["Referer"] = "https://search.brave.com/"
-
-        async with httpx.AsyncClient(
-            timeout=20.0,
-            follow_redirects=True,
-        ) as client:
-            resp = await client.get(
-                "https://search.brave.com/search",
-                params={"q": query, "source": "web"},
-                headers=headers,
-            )
-            if resp.status_code != 200:
-                logger.warning(f"Brave Search returned status {resp.status_code}")
-                return []
-
-            html = resp.text
-
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html, "html.parser")
-        results = []
-
-        for item in soup.select(".snippet, [data-type='web']"):
-            title_el = item.select_one(".snippet-title, .title, a[href^='http']")
-            desc_el = item.select_one(".snippet-description, .snippet-content, .description")
-
-            if not title_el:
-                continue
-
-            title = title_el.get_text(strip=True)
-            href = title_el.get("href", "")
-
-            if not href.startswith("http"):
-                link = item.select_one("a[href^='http']")
-                if link:
-                    href = link.get("href", "")
-
-            snippet = desc_el.get_text(strip=True) if desc_el else ""
+        # ── Top stories (news) ──
+        for story in data.get("top_stories", [])[:3]:
+            title = story.get("title", "").strip()
+            url = story.get("link", "").strip()
+            source = story.get("source", "")
+            date = story.get("date", "")
+
+            if url and title:
+                results.append({
+                    "title": f"[News] {title}",
+                    "url": url,
+                    "snippet": f"Source: {source}" + (f" | {date}" if date else ""),
+                    "date": date,
+                })
+
+        # ── Related questions (People Also Ask) ──
+        paa = data.get("related_questions", [])
+        if paa:
+            paa_lines = []
+            for q in paa[:4]:
+                question = q.get("question", "")
+                paa_snippet = q.get("snippet", "")
+                if question:
+                    entry = question
+                    if paa_snippet:
+                        entry += f" → {paa_snippet[:150]}"
+                    paa_lines.append(entry)
+            if paa_lines:
+                results.append({
+                    "title": "[People Also Ask]",
+                    "url": "",
+                    "snippet": " | ".join(paa_lines),
+                    "date": "",
+                })
+
+        # ── Related searches ──
+        related = data.get("related_searches", [])
+        if related:
+            related_queries = [r.get("query", "") for r in related[:5] if r.get("query")]
+            if related_queries:
+                results.append({
+                    "title": "[Related Searches]",
+                    "url": "",
+                    "snippet": " | ".join(related_queries),
+                    "date": "",
+                })
+
+        logger.info(f"SerpAPI: {len(data.get('organic_results', []))} organic, "
+                     f"KG={'yes' if kg.get('title') else 'no'}, "
+                     f"answer={'yes' if answer else 'no'}, "
+                     f"news={len(data.get('top_stories', []))}, "
+                     f"total={len(results)}")

-            if _is_valid_result(href, title):
-                results.append({"title": title, "url": href, "snippet": snippet})
-
-            if len(results) >= n:
-                break
-
-        logger.info(f"Brave Search parsed {len(results)} results")
        return results

-    except Exception as e:
-        logger.error(f"Brave HTML search error: {type(e).__name__}: {e}")
+    except httpx.TimeoutException:
+        logger.error("SerpAPI request timed out")
        return []
-
-
-# ═══════════════════════════════════════════════════
-#  Strategy 5: Google HTML scraping (last resort)
-# ═══════════════════════════════════════════════════
-
-async def _google_html_search(query: str, n: int) -> list[dict]:
-    """Scrape Google search results as absolute last resort."""
-    try:
-        headers = _get_headers()
-        headers["Referer"] = "https://www.google.com/"
-
-        async with httpx.AsyncClient(
-            timeout=20.0,
-            follow_redirects=True,
-        ) as client:
-            resp = await client.get(
-                "https://www.google.com/search",
-                params={"q": query, "hl": "en", "gl": "us", "num": str(n + 5)},
-                headers=headers,
-            )
-            if resp.status_code != 200:
-                logger.warning(f"Google returned status {resp.status_code}")
-                return []
-
-            html = resp.text
-
-        from bs4 import BeautifulSoup
-        soup = BeautifulSoup(html, "html.parser")
-        results = []
-
-        # Google search results are in divs with class 'g' or similar
-        for g_div in soup.select("div.g, div.tF2Cxc, div[data-sokoban-container]"):
-            # Find the link
-            link_el = g_div.select_one("a[href^='http']")
-            if not link_el:
-                continue
-
-            href = link_el.get("href", "")
-
-            # Clean Google redirect URLs
-            if href.startswith("/url?"):
-                parsed = parse_qs(urlparse(href).query)
-                href = parsed.get("q", parsed.get("url", [href]))[0]
-
-            # Find title
-            title_el = g_div.select_one("h3")
-            title = title_el.get_text(strip=True) if title_el else link_el.get_text(strip=True)
-
-            # Find snippet
-            snippet_el = g_div.select_one(".VwiC3b, .IsZvec, .s3v9rd, span.st")
-            snippet = snippet_el.get_text(strip=True) if snippet_el else ""
-
-            if _is_valid_result(href, title):
-                results.append({"title": title, "url": href, "snippet": snippet})
-
-            if len(results) >= n:
-                break
-
-        # Fallback: just find any h3 > a patterns
-        if not results:
-            for h3 in soup.find_all("h3"):
-                parent_a = h3.find_parent("a")
-                if parent_a:
-                    href = parent_a.get("href", "")
-                    if href.startswith("/url?"):
-                        parsed = parse_qs(urlparse(href).query)
-                        href = parsed.get("q", parsed.get("url", [href]))[0]
-                    title = h3.get_text(strip=True)
-                    if _is_valid_result(href, title):
-                        results.append({"title": title, "url": href, "snippet": ""})
-                    if len(results) >= n:
-                        break
-
-        logger.info(f"Google HTML parsed {len(results)} results")
-        return results
-
    except Exception as e:
-        logger.error(f"Google HTML search error: {type(e).__name__}: {e}")
+        logger.error(f"SerpAPI search error: {type(e).__name__}: {e}")
        return []


 # ═══════════════════════════════════════════════════
-#  Page content fetcher
+#  Page Content Fetcher
 # ═══════════════════════════════════════════════════

 async def _fetch_pages(results: list[dict]) -> list[dict]:
    """Fetch and extract text content from result URLs."""

    async def _fetch_one(r: dict) -> dict | None:
+        url = r.get("url", "")
+        if not url or not url.startswith("http"):
+            return None
+
        try:
            async with httpx.AsyncClient(
                timeout=12.0,
                follow_redirects=True,
                http2=False,
            ) as client:
-                resp = await client.get(r["url"], headers=_get_headers())
+                headers = {
+                    "User-Agent": _USER_AGENTS[0],
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.9",
+                }
+                resp = await client.get(url, headers=headers)

                if resp.status_code != 200:
                    return None
@@ -589,14 +319,14 @@ async def _fetch_pages(results: list[dict]) -> list[dict]:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(html, "html.parser")

-            # Remove non-content elements
+            # Remove noise
            for tag in soup(["script", "style", "nav", "footer", "header", "aside",
                             "form", "noscript", "svg", "iframe", "button", "input",
                             "select", "textarea", "menu", "[role='navigation']",
                             "[role='banner']", "[role='complementary']"]):
                tag.decompose()

-            # Try to find main content area
+            # Find main content
            main = soup.select_one(
                "main, article, [role='main'], .post-content, .entry-content, "
                ".article-body, .article-content, #content, .content, "
@@ -605,15 +335,15 @@ async def _fetch_pages(results: list[dict]) -> list[dict]:

            text = (main or soup.body or soup).get_text(separator="\n", strip=True)

-            # Clean up — remove very short lines (navigation remnants)
+            # Clean up nav remnants
            lines = [l.strip() for l in text.split("\n") if l.strip() and len(l.strip()) > 15]
            text = "\n".join(lines)

            if len(text) > 200:
-                return {"title": r["title"], "url": r["url"], "content": text}
+                return {"title": r["title"], "url": url, "content": text}

        except Exception as e:
-            logger.debug(f"Failed to fetch {r.get('url', '?')}: {e}")
+            logger.debug(f"Failed to fetch {url}: {e}")

        return None


--- a/requirements.txt
+++ b/requirements.txt
@@ -12,5 +12,4 @@ pydantic==2.10.4
 Pillow==11.1.0
 beautifulsoup4==4.12.3
 python-pptx==1.0.2
-python-docx==1.1.2
-duckduckgo-search>=7.0.0
\ No newline at end of file
+python-docx==1.1.2
\ No newline at end of file