Commit 37b9873f authored by Mahmoud Aglan

fixBig Scandal

parent 59950ea5
""" """
Background generation manager — v4.0.0 Enhanced Background generation manager — v4.1.0
Decouples AI generation from the SSE HTTP connection. Smart codebase loading for massive repos + persistent file context.
Full repository awareness + persistent attachment context.
""" """
import asyncio import asyncio
...@@ -17,10 +16,29 @@ from backend.services import bedrock_service, memory_service, rag_service, attac ...@@ -17,10 +16,29 @@ from backend.services import bedrock_service, memory_service, rag_service, attac
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Repo context cache — avoids re-fetching every message # Caches
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
_repo_cache: dict[str, tuple[float, str]] = {}
REPO_CACHE_TTL = 300 # 5 minutes # Tree cache: repo_id:branch → (timestamp, tree_list)
# Cached repository trees, keyed "repo_id:branch"; each value is
# (time the tree was stored, list of tree items).
_tree_cache: dict[str, tuple[float, list[dict]]] = {}
# Seconds a cached tree is considered fresh (600 s = 10 minutes).
TREE_CACHE_TTL = 600 # 10 minutes
# File paths already loaded into context, per chat (keyed by chat id).
# NOTE(review): nothing visible here removes entries — confirm whether this
# can grow without bound in a long-running process.
_chat_file_history: dict[str, set[str]] = {}
def _get_tree_cache(repo_id: str, branch: str) -> list[dict] | None:
    """Return the cached tree for ``repo_id``/``branch`` if it is still fresh.

    Returns ``None`` when nothing is cached for that pair or when the cached
    entry is ``TREE_CACHE_TTL`` seconds old or older.
    """
    entry = _tree_cache.get(f"{repo_id}:{branch}")
    if entry is None:
        return None
    stored_at, cached_tree = entry
    age = time.time() - stored_at
    return cached_tree if age < TREE_CACHE_TTL else None
def _set_tree_cache(repo_id: str, branch: str, tree: list[dict]):
    """Cache ``tree`` for ``repo_id``/``branch``, stamped with the current time."""
    cache_key = f"{repo_id}:{branch}"
    stamped_entry = (time.time(), tree)
    _tree_cache[cache_key] = stamped_entry
@dataclass @dataclass
...@@ -84,12 +102,26 @@ class GenerationManager: ...@@ -84,12 +102,26 @@ class GenerationManager:
break break
await asyncio.sleep(0.02) await asyncio.sleep(0.02)
def invalidate_repo_cache(self, repo_id: str):
    """Drop every cached tree for ``repo_id`` (all branches).

    Call after a commit so the next message re-fetches the tree.
    """
    prefix = f"{repo_id}:"
    # Collect first: the dict must not change size while it is being iterated.
    stale_keys = [key for key in _tree_cache if key.startswith(prefix)]
    for key in stale_keys:
        _tree_cache.pop(key, None)
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Build FULL repo context with file contents + cache # Smart repo context builder
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
async def _build_repo_context(self, db, chat) -> Optional[str]: async def _build_repo_context(
"""Load full repository file contents for project-aware conversations.""" self, db, chat, user_query: str
) -> Optional[str]:
"""
Build repo context using smart file selection.
For ANY size codebase:
1. Full file tree (paths only) — always included
2. Priority files (configs, entry points) — always loaded
3. Query-relevant files — loaded based on what user asked
"""
if not chat.linked_repo_id: if not chat.linked_repo_id:
return None return None
...@@ -101,64 +133,114 @@ class GenerationManager: ...@@ -101,64 +133,114 @@ class GenerationManager:
if not settings or not settings.is_active or not settings.gitlab_url or not settings.gitlab_token: if not settings or not settings.is_active or not settings.gitlab_url or not settings.gitlab_token:
return None return None
# Check cache gl_url = settings.gitlab_url
cache_key = f"{repo.id}:{repo.default_branch}" gl_token = settings.gitlab_token
now = time.time() branch = repo.default_branch
if cache_key in _repo_cache:
ts, ctx = _repo_cache[cache_key]
if now - ts < REPO_CACHE_TTL:
return ctx
try: try:
# Load full file contents (up to 80 files, 300K chars) # 1. Get tree (cached)
result = await gitlab_service.load_project_files( tree = _get_tree_cache(repo.id, branch)
settings.gitlab_url, settings.gitlab_token, if tree is None:
repo.gitlab_project_id, ref=repo.default_branch, tree = await gitlab_service.get_tree(
gl_url, gl_token, repo.gitlab_project_id,
ref=branch, recursive=True,
) )
_set_tree_cache(repo.id, branch, tree)
# 2. Get previously discussed files for this chat
prev_files = _chat_file_history.get(chat.id, set())
# 3. Smart-load files
result = await gitlab_service.load_smart_files(
gl_url, gl_token, repo.gitlab_project_id,
ref=branch, tree=tree,
user_query=user_query,
previous_files=prev_files,
)
# 4. Track loaded files for future messages
loaded_paths = set()
for f in result["priority_files"]:
loaded_paths.add(f["path"])
for f in result["query_files"]:
loaded_paths.add(f["path"])
if chat.id not in _chat_file_history:
_chat_file_history[chat.id] = set()
_chat_file_history[chat.id].update(loaded_paths)
# 5. Format the context
return self._format_smart_context(result, tree, repo)
context = self._format_full_repo_context(result, repo)
_repo_cache[cache_key] = (now, context)
return context
except Exception as e: except Exception as e:
# Fallback: try just the tree # Fallback: just the tree
try: try:
tree = await gitlab_service.get_tree( tree = await gitlab_service.get_tree(
settings.gitlab_url, settings.gitlab_token, gl_url, gl_token, repo.gitlab_project_id, ref=branch,
repo.gitlab_project_id, ref=repo.default_branch,
) )
return gitlab_service.format_tree_for_prompt(tree, repo.name, repo.default_branch) return gitlab_service.format_tree_for_prompt(tree, repo.name, branch)
except Exception: except Exception:
return f"[Repository: {repo.name} — could not load: {str(e)[:100]}]" return f"[Repository: {repo.name} — error: {str(e)[:200]}]"
def _format_smart_context(
self, result: dict, tree: list[dict], repo
) -> str:
"""Format loaded files into prompt context."""
# File tree
files_in_tree = sorted(
[i["path"] for i in tree if i["type"] == "blob"]
)
dirs_in_tree = sorted(
[i["path"] for i in tree if i["type"] == "tree"]
)
def _format_full_repo_context(self, result: dict, repo) -> str:
"""Format loaded project files into a prompt-friendly string."""
lines = [ lines = [
f"Repository: {repo.name}", f"Repository: {repo.name}",
f"Branch: {repo.default_branch}", f"Branch: {repo.default_branch}",
f"Path: {repo.path_with_namespace}", f"Path: {repo.path_with_namespace}",
f"Total files in tree: {result.get('total_files_in_tree', '?')}", f"Total files: {len(files_in_tree)} | Directories: {len(dirs_in_tree)}",
f"Files loaded: {result.get('files_loaded', '?')}", f"Files loaded into context: {result['files_loaded']}",
f"Total characters: {result.get('total_characters', '?')}", f"Characters loaded: {result['total_characters']:,}",
"", "",
"FILE CONTENTS:", "═" * 60,
"=" * 60, "COMPLETE FILE TREE (all paths):",
"═" * 60,
] ]
for fp in files_in_tree:
for f in result.get("files", []): lines.append(f" {fp}")
path = f.get("path", "?")
content = f.get("content", "") lines.append("")
lines.append(f"\n━━━ {path} ━━━") lines.append("═" * 60)
lines.append(content) lines.append("LOADED FILE CONTENTS:")
lines.append(f"━━━ end {path} ━━━") lines.append("═" * 60)
# Priority files
if result["priority_files"]:
lines.append("")
lines.append("── Config & Entry Point Files ──")
for f in result["priority_files"]:
lines.append(f"\n━━━ {f['path']} ━━━")
lines.append(f["content"])
lines.append(f"━━━ end {f['path']} ━━━")
# Query-relevant files
if result["query_files"]:
lines.append("")
lines.append("── Files Relevant to Current Question ──")
for f in result["query_files"]:
lines.append(f"\n━━━ {f['path']} ━━━")
lines.append(f["content"])
lines.append(f"━━━ end {f['path']} ━━━")
# Note about unloaded files
unloaded = len(files_in_tree) - result["files_loaded"]
if unloaded > 0:
lines.append("")
lines.append(f"NOTE: {unloaded} additional files exist in the repository.")
lines.append("If you need to see a specific file, ask the user to mention it by name.")
lines.append("You can see ALL file paths in the tree above.")
return "\n".join(lines) return "\n".join(lines)
def invalidate_repo_cache(self, repo_id: str):
    """Drop every cached repo context for ``repo_id`` (all branches).

    Call when a commit is made so the cache is refreshed.
    """
    prefix = f"{repo_id}:"
    # Collect first: the dict must not change size while it is being iterated.
    stale_keys = [key for key in _repo_cache if key.startswith(prefix)]
    for key in stale_keys:
        _repo_cache.pop(key, None)
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Main generation loop # Main generation loop
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
...@@ -184,7 +266,7 @@ class GenerationManager: ...@@ -184,7 +266,7 @@ class GenerationManager:
db_user = db.query(User).filter(User.id == user_id).first() db_user = db.query(User).filter(User.id == user_id).first()
# Quota check + reset # Quota reset
now = datetime.utcnow() now = datetime.utcnow()
if db_user.quota_reset_date and now >= db_user.quota_reset_date: if db_user.quota_reset_date and now >= db_user.quota_reset_date:
db_user.tokens_used_this_month = 0 db_user.tokens_used_this_month = 0
...@@ -198,7 +280,7 @@ class GenerationManager: ...@@ -198,7 +280,7 @@ class GenerationManager:
state.events.append({"type": "error", "message": "Monthly token quota exceeded."}) state.events.append({"type": "error", "message": "Monthly token quota exceeded."})
return return
# Process current message attachments # Process attachments
attachments = [] attachments = []
if attachment_ids: if attachment_ids:
attachments = ( attachments = (
...@@ -223,7 +305,7 @@ class GenerationManager: ...@@ -223,7 +305,7 @@ class GenerationManager:
if attachments: if attachments:
db.commit() db.commit()
# RAG context # RAG
kb_id = knowledge_base_id or chat.knowledge_base_id kb_id = knowledge_base_id or chat.knowledge_base_id
rag_context = None rag_context = None
if kb_id: if kb_id:
...@@ -232,13 +314,13 @@ class GenerationManager: ...@@ -232,13 +314,13 @@ class GenerationManager:
except Exception: except Exception:
pass pass
# ── FULL REPO CONTEXT (loads all file contents) ── # ── SMART REPO CONTEXT (query-aware file loading) ──
repo_context = await self._build_repo_context(db, chat) repo_context = await self._build_repo_context(db, chat, content)
# ── PERSISTENT ATTACHMENT CONTEXT (all files in chat) ── # ── PERSISTENT ATTACHMENT CONTEXT ──
attachment_context = memory_service.gather_attachment_context(chat_id, db) attachment_context = memory_service.gather_attachment_context(chat_id, db)
# Build system prompt with ALL context # Build system prompt
system_prompt = build_full_prompt( system_prompt = build_full_prompt(
rag_context=rag_context, rag_context=rag_context,
repo_context=repo_context, repo_context=repo_context,
...@@ -248,13 +330,13 @@ class GenerationManager: ...@@ -248,13 +330,13 @@ class GenerationManager:
# Build conversation messages # Build conversation messages
messages = memory_service.build_messages(chat, db) messages = memory_service.build_messages(chat, db)
# Inject current message's multimodal content blocks # Inject multimodal content blocks for current attachments
if attachments and messages and messages[-1]["role"] == "user": if attachments and messages and messages[-1]["role"] == "user":
content_blocks = attachment_service.build_claude_content_blocks(attachments) content_blocks = attachment_service.build_claude_content_blocks(attachments)
content_blocks.append({"type": "text", "text": content}) content_blocks.append({"type": "text", "text": content})
messages[-1]["content"] = content_blocks messages[-1]["content"] = content_blocks
# Thinking / reasoning config # Thinking config
effective_max = max_tokens effective_max = max_tokens
thinking_config = None thinking_config = None
if reasoning_budget > 0: if reasoning_budget > 0:
...@@ -322,7 +404,7 @@ class GenerationManager: ...@@ -322,7 +404,7 @@ class GenerationManager:
state.message_id = assistant_msg.id state.message_id = assistant_msg.id
# Auto-generate title for first message # Auto-title
msg_count = db.query(Message).filter(Message.chat_id == chat_id).count() msg_count = db.query(Message).filter(Message.chat_id == chat_id).count()
if msg_count <= 2 and chat.title == "New Chat": if msg_count <= 2 and chat.title == "New Chat":
try: try:
......
""" """
GitLab CE REST API v4 client. GitLab CE REST API v4 client — Enhanced for massive codebases.
Uses httpx (already in requirements) for all HTTP calls. Smart file selection for repos up to 10M+ tokens.
All functions are async so they work cleanly in FastAPI async endpoints.
""" """
import base64 import base64
import json import json
import re
from typing import Optional from typing import Optional
from urllib.parse import quote from urllib.parse import quote
...@@ -20,15 +20,10 @@ class GitLabError(Exception): ...@@ -20,15 +20,10 @@ class GitLabError(Exception):
def _timeout(): def _timeout():
return httpx.Timeout(connect=15.0, read=60.0, write=30.0, pool=30.0) return httpx.Timeout(connect=15.0, read=120.0, write=30.0, pool=30.0)
async def _request( async def _request(method: str, url: str, token: str, **kwargs) -> dict | list:
method: str,
url: str,
token: str,
**kwargs,
) -> dict | list:
async with httpx.AsyncClient(timeout=_timeout()) as client: async with httpx.AsyncClient(timeout=_timeout()) as client:
headers = {"Private-Token": token} headers = {"Private-Token": token}
resp = await client.request(method, url, headers=headers, **kwargs) resp = await client.request(method, url, headers=headers, **kwargs)
...@@ -94,19 +89,15 @@ async def create_project( ...@@ -94,19 +89,15 @@ async def create_project(
) -> dict: ) -> dict:
url = _api(gitlab_url, "/projects") url = _api(gitlab_url, "/projects")
body = { body = {
"name": name, "name": name, "description": description,
"description": description, "visibility": visibility, "initialize_with_readme": initialize_with_readme,
"visibility": visibility,
"initialize_with_readme": initialize_with_readme,
} }
data = await _request("POST", url, token, json=body) data = await _request("POST", url, token, json=body)
return { return {
"id": data["id"], "id": data["id"], "name": data["name"],
"name": data["name"],
"path_with_namespace": data["path_with_namespace"], "path_with_namespace": data["path_with_namespace"],
"default_branch": data.get("default_branch", "main"), "default_branch": data.get("default_branch", "main"),
"web_url": data.get("web_url", ""), "web_url": data.get("web_url", ""), "description": data.get("description") or "",
"description": data.get("description") or "",
} }
...@@ -114,20 +105,17 @@ async def get_project(gitlab_url: str, token: str, project_id: int) -> dict: ...@@ -114,20 +105,17 @@ async def get_project(gitlab_url: str, token: str, project_id: int) -> dict:
url = _api(gitlab_url, f"/projects/{project_id}") url = _api(gitlab_url, f"/projects/{project_id}")
data = await _request("GET", url, token) data = await _request("GET", url, token)
return { return {
"id": data["id"], "id": data["id"], "name": data["name"],
"name": data["name"],
"path_with_namespace": data["path_with_namespace"], "path_with_namespace": data["path_with_namespace"],
"default_branch": data.get("default_branch", "main"), "default_branch": data.get("default_branch", "main"),
"web_url": data.get("web_url", ""), "web_url": data.get("web_url", ""), "description": data.get("description") or "",
"description": data.get("description") or "",
"last_activity_at": data.get("last_activity_at", ""), "last_activity_at": data.get("last_activity_at", ""),
"forks_count": data.get("forks_count", 0), "forks_count": data.get("forks_count", 0), "star_count": data.get("star_count", 0),
"star_count": data.get("star_count", 0),
} }
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Repository Tree # Repository Tree — loads ALL paths
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
async def get_tree( async def get_tree(
...@@ -138,7 +126,10 @@ async def get_tree( ...@@ -138,7 +126,10 @@ async def get_tree(
all_items = [] all_items = []
page = 1 page = 1
while True: while True:
params = {"ref": ref, "per_page": 100, "page": page, "recursive": str(recursive).lower()} params = {
"ref": ref, "per_page": 100, "page": page,
"recursive": str(recursive).lower(),
}
if path: if path:
params["path"] = path params["path"] = path
url = _api(gitlab_url, f"/projects/{project_id}/repository/tree") url = _api(gitlab_url, f"/projects/{project_id}/repository/tree")
...@@ -152,7 +143,7 @@ async def get_tree( ...@@ -152,7 +143,7 @@ async def get_tree(
if len(data) < 100: if len(data) < 100:
break break
page += 1 page += 1
if page > 20: if page > 500: # Up to 50K files
break break
return [ return [
{"name": i["name"], "path": i["path"], "type": i["type"], "mode": i.get("mode", "")} {"name": i["name"], "path": i["path"], "type": i["type"], "mode": i.get("mode", "")}
...@@ -164,18 +155,22 @@ def format_tree_for_prompt(items: list[dict], repo_name: str, branch: str) -> st ...@@ -164,18 +155,22 @@ def format_tree_for_prompt(items: list[dict], repo_name: str, branch: str) -> st
if not items: if not items:
return f"[Repository: {repo_name} ({branch}) — empty or inaccessible]" return f"[Repository: {repo_name} ({branch}) — empty or inaccessible]"
dirs = set()
files = [] files = []
dirs = set()
for item in sorted(items, key=lambda x: x["path"]): for item in sorted(items, key=lambda x: x["path"]):
if item["type"] == "tree": if item["type"] == "tree":
dirs.add(item["path"]) dirs.add(item["path"])
else: else:
files.append(item["path"]) files.append(item["path"])
lines = [f"Repository: {repo_name} (branch: {branch})", f"Total files: {len(files)}", ""] lines = [
f"Repository: {repo_name} (branch: {branch})",
f"Total files: {len(files)}",
f"Total directories: {len(dirs)}",
"",
]
for f in files: for f in files:
lines.append(f" {f}") lines.append(f" {f}")
return "\n".join(lines) return "\n".join(lines)
...@@ -233,25 +228,14 @@ async def get_file_raw( ...@@ -233,25 +228,14 @@ async def get_file_raw(
async def commit_files( async def commit_files(
gitlab_url: str, token: str, gitlab_url: str, token: str,
project_id: int, branch: str, project_id: int, branch: str,
commit_message: str, commit_message: str, actions: list[dict],
actions: list[dict],
) -> dict: ) -> dict:
"""
Atomic multi-file commit.
Each action: {"action": "create"|"update"|"delete", "file_path": "...", "content": "..."}
"""
url = _api(gitlab_url, f"/projects/{project_id}/repository/commits") url = _api(gitlab_url, f"/projects/{project_id}/repository/commits")
body = { body = {"branch": branch, "commit_message": commit_message, "actions": actions}
"branch": branch,
"commit_message": commit_message,
"actions": actions,
}
data = await _request("POST", url, token, json=body) data = await _request("POST", url, token, json=body)
return { return {
"id": data.get("id", ""), "id": data.get("id", ""), "short_id": data.get("short_id", ""),
"short_id": data.get("short_id", ""), "message": data.get("message", ""), "web_url": data.get("web_url", ""),
"message": data.get("message", ""),
"web_url": data.get("web_url", ""),
} }
...@@ -259,8 +243,7 @@ async def commit_single_file( ...@@ -259,8 +243,7 @@ async def commit_single_file(
gitlab_url: str, token: str, gitlab_url: str, token: str,
project_id: int, branch: str, project_id: int, branch: str,
file_path: str, content: str, file_path: str, content: str,
commit_message: str, commit_message: str, action: str = "update",
action: str = "update",
) -> dict: ) -> dict:
actions = [{"action": action, "file_path": file_path, "content": content}] actions = [{"action": action, "file_path": file_path, "content": content}]
return await commit_files(gitlab_url, token, project_id, branch, commit_message, actions) return await commit_files(gitlab_url, token, project_id, branch, commit_message, actions)
...@@ -270,17 +253,13 @@ async def commit_single_file( ...@@ -270,17 +253,13 @@ async def commit_single_file(
# Branches # Branches
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
async def list_branches( async def list_branches(gitlab_url: str, token: str, project_id: int) -> list[dict]:
gitlab_url: str, token: str,
project_id: int,
) -> list[dict]:
url = _api(gitlab_url, f"/projects/{project_id}/repository/branches") url = _api(gitlab_url, f"/projects/{project_id}/repository/branches")
params = {"per_page": 100} params = {"per_page": 100}
data = await _request("GET", url, token, params=params) data = await _request("GET", url, token, params=params)
return [ return [
{ {
"name": b["name"], "name": b["name"], "default": b.get("default", False),
"default": b.get("default", False),
"web_url": b.get("web_url", ""), "web_url": b.get("web_url", ""),
"commit_short_id": b.get("commit", {}).get("short_id", ""), "commit_short_id": b.get("commit", {}).get("short_id", ""),
"commit_message": (b.get("commit", {}).get("message") or "")[:100], "commit_message": (b.get("commit", {}).get("message") or "")[:100],
...@@ -291,8 +270,7 @@ async def list_branches( ...@@ -291,8 +270,7 @@ async def list_branches(
async def create_branch( async def create_branch(
gitlab_url: str, token: str, gitlab_url: str, token: str,
project_id: int, project_id: int, branch_name: str, ref: str = "main",
branch_name: str, ref: str = "main",
) -> dict: ) -> dict:
url = _api(gitlab_url, f"/projects/{project_id}/repository/branches") url = _api(gitlab_url, f"/projects/{project_id}/repository/branches")
body = {"branch": branch_name, "ref": ref} body = {"branch": branch_name, "ref": ref}
...@@ -306,30 +284,23 @@ async def create_branch( ...@@ -306,30 +284,23 @@ async def create_branch(
async def create_merge_request( async def create_merge_request(
gitlab_url: str, token: str, gitlab_url: str, token: str,
project_id: int, project_id: int, source_branch: str, target_branch: str,
source_branch: str, title: str, description: str = "",
target_branch: str,
title: str,
description: str = "",
) -> dict: ) -> dict:
url = _api(gitlab_url, f"/projects/{project_id}/merge_requests") url = _api(gitlab_url, f"/projects/{project_id}/merge_requests")
body = { body = {
"source_branch": source_branch, "source_branch": source_branch, "target_branch": target_branch,
"target_branch": target_branch, "title": title, "description": description,
"title": title,
"description": description,
} }
data = await _request("POST", url, token, json=body) data = await _request("POST", url, token, json=body)
return { return {
"iid": data.get("iid"), "iid": data.get("iid"), "title": data.get("title", ""),
"title": data.get("title", ""), "web_url": data.get("web_url", ""), "state": data.get("state", ""),
"web_url": data.get("web_url", ""),
"state": data.get("state", ""),
} }
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
# Bulk Load for Analysis # SMART FILE SELECTION — the core of large repo support
# ═══════════════════════════════════════════════════ # ═══════════════════════════════════════════════════
TEXT_EXTENSIONS = { TEXT_EXTENSIONS = {
...@@ -338,10 +309,288 @@ TEXT_EXTENSIONS = { ...@@ -338,10 +309,288 @@ TEXT_EXTENSIONS = {
".gd", ".html", ".css", ".scss", ".json", ".yaml", ".yml", ".xml", ".gd", ".html", ".css", ".scss", ".json", ".yaml", ".yml", ".xml",
".toml", ".ini", ".cfg", ".sh", ".bash", ".sql", ".md", ".txt", ".toml", ".ini", ".cfg", ".sh", ".bash", ".sql", ".md", ".txt",
".env", ".dockerfile", ".vue", ".svelte", ".dart", ".r", ".csv", ".env", ".dockerfile", ".vue", ".svelte", ".dart", ".r", ".csv",
".graphql", ".proto", ".tf", ".hcl", ".gradle", ".cmake",
".makefile", ".rake", ".gemspec", ".lock", ".mod", ".sum",
".csproj", ".sln", ".props", ".targets", ".fsproj",
} }
MAX_ANALYSIS_FILES = 80 # Files that are ALWAYS loaded if they exist (project understanding)
MAX_ANALYSIS_CHARS = 300_000 PRIORITY_PATTERNS = [
# Package manifests & configs
r"^package\.json$", r"^package-lock\.json$", r"^yarn\.lock$",
r"^requirements\.txt$", r"^setup\.py$", r"^setup\.cfg$", r"^pyproject\.toml$",
r"^Pipfile$", r"^Cargo\.toml$", r"^go\.mod$", r"^go\.sum$",
r"^Gemfile$", r"^pom\.xml$", r"^build\.gradle$",
r"^\.csproj$", r".*\.sln$",
# Docker & CI
r"^Dockerfile", r"^docker-compose", r"^\.dockerignore$",
r"^\.gitlab-ci\.yml$", r"^\.github/", r"^Jenkinsfile$",
# Config files
r"^tsconfig.*\.json$", r"^vite\.config", r"^webpack\.config",
r"^tailwind\.config", r"^postcss\.config", r"^babel\.config",
r"^\.eslintrc", r"^\.prettierrc", r"^\.editorconfig$",
r"^next\.config", r"^nuxt\.config", r"^angular\.json$",
r"^\.env\.example$", r"^\.env\.sample$",
# Docs
r"^README", r"^CHANGELOG", r"^CONTRIBUTING",
r"^docs/.*\.md$",
# Entry points
r"^main\.", r"^index\.", r"^app\.", r"^server\.",
r"^Program\.", r"^Startup\.",
r"^src/main\.", r"^src/index\.", r"^src/app\.", r"^src/App\.",
r"^backend/main\.", r"^frontend/src/main\.", r"^frontend/src/App\.",
r"^cmd/", r"^internal/",
# Database
r".*migrations/.*", r".*models\.", r".*schema\.",
]
# Size limits for smart file loading. Values are character counts unless
# noted otherwise.
MAX_SMART_CHARS = 600_000 # Overall content budget (~150K tokens); the sum of the two budgets below
MAX_PRIORITY_CHARS = 150_000 # Budget reserved for priority files
MAX_QUERY_CHARS = 450_000 # Budget for query-relevant files
MAX_FILES_TO_LOAD = 300 # Hard cap on number of files (a count, not characters)
MAX_SINGLE_FILE = 50_000 # Longer files are truncated to this length (not skipped)
def _is_text_file(path: str) -> bool:
    """Return True if ``path`` looks like a text file that can be loaded.

    The decision uses only the final path component, case-insensitively:

    1. Well-known extensionless or dot-prefixed text files (``Dockerfile``,
       ``Makefile``, ``Pipfile``, ``.editorconfig``, ...) are accepted by name.
    2. ``Dockerfile.<suffix>`` variants (e.g. ``Dockerfile.dev``) are accepted.
    3. Otherwise the extension must be listed in ``TEXT_EXTENSIONS``.

    Names without a dot that are not in the known list are rejected.
    """
    name = path.lower().rsplit("/", 1)[-1]

    # Known text filenames. Pipfile and the .eslintrc/.prettierrc/.editorconfig
    # dotfiles are included because PRIORITY_PATTERNS names them; without an
    # entry here they would be filtered out before priority matching runs.
    if name in {
        "dockerfile", "makefile", "gemfile", "rakefile", "procfile",
        "vagrantfile", "jenkinsfile", "brewfile", "pipfile", ".gitignore",
        ".dockerignore", ".editorconfig", ".eslintrc", ".prettierrc",
        ".env.example", ".env.sample",
    }:
        return True

    # Dockerfile variants such as "Dockerfile.dev" have an arbitrary suffix
    # that will never appear in TEXT_EXTENSIONS.
    if name.startswith("dockerfile."):
        return True

    # Fall back to the extension; a name with no dot has none.
    _, dot, suffix = name.rpartition(".")
    return bool(dot) and f".{suffix}" in TEXT_EXTENSIONS
def _is_priority_file(path: str) -> bool:
    """Return True if ``path`` matches any priority pattern.

    Each entry of ``PRIORITY_PATTERNS`` is searched against the full path,
    ignoring case; the first hit wins.
    """
    return any(
        re.search(pattern, path, re.IGNORECASE) is not None
        for pattern in PRIORITY_PATTERNS
    )
# Query keyword -> file extensions (or bare lowercase filenames) it implies.
_TECH_EXT_MAP = {
    "react": {".jsx", ".tsx", ".js"},
    "vue": {".vue"},
    "svelte": {".svelte"},
    "python": {".py"},
    "django": {".py"},
    "flask": {".py"},
    "fastapi": {".py"},
    "typescript": {".ts", ".tsx"},
    "javascript": {".js", ".jsx"},
    "rust": {".rs"},
    "go": {".go"},
    "java": {".java"},
    "csharp": {".cs"},
    "c#": {".cs"},
    "unity": {".cs"},
    "ruby": {".rb"},
    "php": {".php"},
    "swift": {".swift"},
    "kotlin": {".kt"},
    "dart": {".dart"},
    "sql": {".sql"},
    "docker": {"dockerfile", ".yml", ".yaml"},
    "css": {".css", ".scss"},
    "html": {".html"},
    "api": {".py", ".js", ".ts", ".go", ".rs"},
    "route": {".py", ".js", ".ts", ".go"},
    "model": {".py", ".cs", ".java", ".ts"},
    "test": {".py", ".js", ".ts", ".java", ".cs"},
    "config": {".json", ".yaml", ".yml", ".toml", ".ini"},
    "database": {".sql", ".py"},
    "migration": {".sql", ".py"},
    "component": {".jsx", ".tsx", ".vue", ".svelte"},
    "style": {".css", ".scss"},
}

# Directory names that earn a small bonus wherever they appear in the path.
_IMPORTANT_DIRS = frozenset({
    "src", "lib", "app", "api", "routes", "models", "services", "components",
    "pages", "views", "controllers", "utils", "helpers", "config", "tests",
    "test",
})

# Turns path punctuation into spaces so a path can be split into words.
_PATH_WORD_BREAKS = str.maketrans("/._-", "    ")


def _score_file_for_query(path: str, query: str, previous_files: set[str]) -> float:
    """Score how relevant ``path`` is to the user's message.

    The score is a sum of independent signals (higher means more relevant):

    * +200  the whole path appears in the query
    * +100  the filename, or the filename without its extension, appears
    * +40   per directory component (longer than 2 chars) found in the query
    * +15   per path word that is also a 3+ letter word of the query
    * +20   per technology keyword in the query whose extensions fit the file
    * +30   the file was already loaded earlier in this chat
    * +5    per directory component that is a conventional source directory

    Args:
        path: Repository-relative file path.
        query: The user's message; an empty query scores every file 0.0.
        previous_files: Paths loaded for earlier messages of the same chat.

    Returns:
        The accumulated score as a float.
    """
    if not query:
        return 0.0

    score = 0.0
    lower_path = path.lower()
    lower_query = query.lower()

    name = lower_path.rsplit("/", 1)[-1]
    name_no_ext = name.rsplit(".", 1)[0]
    dir_parts = lower_path.split("/")[:-1]
    path_parts = set(lower_path.translate(_PATH_WORD_BREAKS).split())

    # Exact path mentioned in query. An empty string is a substring of
    # everything, so empty values must never count as a mention.
    if path and (lower_path in lower_query or path in query):
        score += 200

    # Filename mentioned. Dotfiles such as ".env" have an empty stem; without
    # the emptiness guard every dotfile would collect this bonus.
    if (name and name in lower_query) or (name_no_ext and name_no_ext in lower_query):
        score += 100

    # Directory mentioned (very short names are too noisy as substrings).
    for directory in dir_parts:
        if len(directory) > 2 and directory in lower_query:
            score += 40

    # Individual path words that are also words of the query. Query words are
    # at least three letters long, so shorter path words can never match.
    query_words = set(re.findall(r'[a-z]{3,}', lower_query))
    score += 15 * len(path_parts & query_words)

    # Technology/framework keywords -> matching extensions or filenames.
    ext = "." + name.rsplit(".", 1)[-1] if "." in name else ""
    for keyword, extensions in _TECH_EXT_MAP.items():
        if keyword in lower_query and (ext in extensions or name in extensions):
            score += 20

    # Bonus for files discussed previously in this chat.
    if path in previous_files:
        score += 30

    # Bonus for files in common important directories.
    for directory in dir_parts:
        if directory in _IMPORTANT_DIRS:
            score += 5

    return score
async def load_smart_files(
    gitlab_url: str,
    token: str,
    project_id: int,
    ref: str,
    tree: list[dict],
    user_query: str = "",
    previous_files: set[str] | None = None,
) -> dict:
    """Select and load the files most useful for answering ``user_query``.

    Text files from ``tree`` are split into priority files (matching
    ``PRIORITY_PATTERNS``) and regular files. Priority files are loaded first,
    in tree order; regular files follow in descending relevance score. Files
    are fetched one at a time with ``get_file_raw``. A file that cannot be
    read is still listed, with the placeholder content "[Could not read file]".

    Limits (each checked before a file is fetched, so a budget can be
    overshot by at most one file):

    * at most 100 priority files, within ``MAX_PRIORITY_CHARS``
    * at most ``MAX_FILES_TO_LOAD`` files in total, within ``MAX_QUERY_CHARS``
      for query files
    * once more than 50 files are loaded, files scoring 0 are not loaded
    * a file longer than ``MAX_SINGLE_FILE`` is truncated, not dropped

    Args:
        gitlab_url: Base URL of the GitLab instance.
        token: GitLab access token.
        project_id: GitLab project id.
        ref: Branch or ref to read files from.
        tree: Tree items as dicts with at least "path" and "type" keys.
        user_query: The user's message, used to score regular files.
        previous_files: Paths loaded earlier in the chat (small score bonus).

    Returns:
        {
            "priority_files": [{"path", "content"}, ...],
            "query_files": [{"path", "content", "score"}, ...],
            "total_files_in_tree": int,  # loadable text files only
            "total_binary_files": int,   # blobs not recognised as text
            "files_loaded": int,         # includes unreadable placeholders
            "priority_chars": int,
            "query_chars": int,
            "total_characters": int,
            "skipped_large": int,        # files that were truncated
        }
    """
    if previous_files is None:
        previous_files = set()

    # Only blobs are files; "tree" entries are directories.
    blob_paths = [item["path"] for item in tree if item["type"] == "blob"]
    text_files = [fp for fp in blob_paths if _is_text_file(fp)]

    # Split into always-loaded priority files and query-scored regular files.
    priority_paths = []
    regular_paths = []
    for fp in text_files:
        if _is_priority_file(fp):
            priority_paths.append(fp)
        else:
            regular_paths.append(fp)

    # Highest score first; the sort is stable, so ties keep tree order.
    scored = [
        (_score_file_for_query(fp, user_query, previous_files), fp)
        for fp in regular_paths
    ]
    scored.sort(key=lambda pair: -pair[0])

    async def fetch_capped(fp: str) -> tuple[str, bool]:
        """Fetch one file; return (content, whether it was truncated)."""
        raw = await get_file_raw(gitlab_url, token, project_id, fp, ref=ref)
        if len(raw) <= MAX_SINGLE_FILE:
            return raw, False
        notice = f"\n\n... [truncated — file is {len(raw)} chars]"
        return raw[:MAX_SINGLE_FILE] + notice, True

    skipped_large = 0

    # Load priority files first.
    priority_loaded = []
    priority_chars = 0
    for fp in priority_paths[:100]:  # Cap at 100 priority files
        if priority_chars >= MAX_PRIORITY_CHARS:
            break
        try:
            content, truncated = await fetch_capped(fp)
        except Exception:
            # Best effort: keep the path visible even if it cannot be read.
            priority_loaded.append({"path": fp, "content": "[Could not read file]"})
            continue
        skipped_large += truncated
        priority_loaded.append({"path": fp, "content": content})
        priority_chars += len(content)

    # Load query-relevant files.
    query_loaded = []
    query_chars = 0
    loaded_paths = {entry["path"] for entry in priority_loaded}
    files_loaded_count = len(priority_loaded)
    for score, fp in scored:
        if files_loaded_count >= MAX_FILES_TO_LOAD:
            break
        if query_chars >= MAX_QUERY_CHARS:
            break
        if fp in loaded_paths:
            continue
        # Unscored files are only used as filler while few files are loaded;
        # everything after the first zero score is zero too, so stop here.
        if score <= 0 and files_loaded_count > 50:
            break
        try:
            content, truncated = await fetch_capped(fp)
        except Exception:
            query_loaded.append({"path": fp, "content": "[Could not read file]", "score": score})
            files_loaded_count += 1
            continue
        skipped_large += truncated
        query_loaded.append({"path": fp, "content": content, "score": score})
        query_chars += len(content)
        loaded_paths.add(fp)
        files_loaded_count += 1

    return {
        "priority_files": priority_loaded,
        "query_files": query_loaded,
        "total_files_in_tree": len(text_files),
        # Counted against blobs only, so directories are not reported as files.
        "total_binary_files": len(blob_paths) - len(text_files),
        "files_loaded": files_loaded_count,
        "priority_chars": priority_chars,
        "query_chars": query_chars,
        "total_characters": priority_chars + query_chars,
        "skipped_large": skipped_large,
    }
# ═══════════════════════════════════════════════════
# Legacy bulk loader (used by /analyze endpoint)
# ═══════════════════════════════════════════════════
# Maximum number of text files the legacy loader keeps from the tree.
MAX_ANALYSIS_FILES = 200
# Maximum total characters of file content the legacy loader returns.
MAX_ANALYSIS_CHARS = 600_000
async def load_project_files( async def load_project_files(
...@@ -354,12 +603,14 @@ async def load_project_files( ...@@ -354,12 +603,14 @@ async def load_project_files(
for item in tree: for item in tree:
if item["type"] != "blob": if item["type"] != "blob":
continue continue
ext = "" if _is_text_file(item["path"]):
name = item["path"].lower()
if "." in name:
ext = "." + name.rsplit(".", 1)[-1]
if ext in TEXT_EXTENSIONS or name in {"dockerfile", "makefile", "gemfile", "rakefile"}:
text_files.append(item["path"]) text_files.append(item["path"])
# Sort: priority files first, then alphabetical
def sort_key(fp):
return (0 if _is_priority_file(fp) else 1, fp)
text_files.sort(key=sort_key)
text_files = text_files[:MAX_ANALYSIS_FILES] text_files = text_files[:MAX_ANALYSIS_FILES]
contents = [] contents = []
...@@ -369,6 +620,8 @@ async def load_project_files( ...@@ -369,6 +620,8 @@ async def load_project_files(
break break
try: try:
raw = await get_file_raw(gitlab_url, token, project_id, fp, ref=ref) raw = await get_file_raw(gitlab_url, token, project_id, fp, ref=ref)
if len(raw) > MAX_SINGLE_FILE:
raw = raw[:MAX_SINGLE_FILE] + "\n... [truncated]"
if total_chars + len(raw) > MAX_ANALYSIS_CHARS: if total_chars + len(raw) > MAX_ANALYSIS_CHARS:
raw = raw[:MAX_ANALYSIS_CHARS - total_chars] + "\n... [truncated]" raw = raw[:MAX_ANALYSIS_CHARS - total_chars] + "\n... [truncated]"
contents.append({"path": fp, "content": raw}) contents.append({"path": fp, "content": raw})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment