Commit 37b9873f authored by Mahmoud Aglan

fixBig Scandal

parent 59950ea5
"""
Background generation manager — v4.0.0 Enhanced
Decouples AI generation from the SSE HTTP connection.
Full repository awareness + persistent attachment context.
Background generation manager — v4.1.0
Smart codebase loading for massive repos + persistent file context.
"""
import asyncio
......@@ -17,10 +16,29 @@ from backend.services import bedrock_service, memory_service, rag_service, attac
# ═══════════════════════════════════════════════════
# Repo context cache — avoids re-fetching every message
# Caches
# ═══════════════════════════════════════════════════
_repo_cache: dict[str, tuple[float, str]] = {}
REPO_CACHE_TTL = 300 # 5 minutes
# Tree cache: repo_id:branch → (timestamp, tree_list)
_tree_cache: dict[str, tuple[float, list[dict]]] = {}
TREE_CACHE_TTL = 600 # 10 minutes
# Tracks which files have been discussed per chat
_chat_file_history: dict[str, set[str]] = {}
def _get_tree_cache(repo_id: str, branch: str) -> list[dict] | None:
key = f"{repo_id}:{branch}"
if key in _tree_cache:
ts, tree = _tree_cache[key]
if time.time() - ts < TREE_CACHE_TTL:
return tree
return None
def _set_tree_cache(repo_id: str, branch: str, tree: list[dict]):
    """Store ``tree`` under ``"<repo_id>:<branch>"`` stamped with the current time."""
    _tree_cache[f"{repo_id}:{branch}"] = (time.time(), tree)
@dataclass
......@@ -84,12 +102,26 @@ class GenerationManager:
break
await asyncio.sleep(0.02)
def invalidate_repo_cache(self, repo_id: str):
    """Drop every cached tree of ``repo_id`` so the next message re-fetches it.

    Intended to be called after a commit. All branches of the repository
    are evicted, because cache keys have the form ``"<repo_id>:<branch>"``.
    """
    prefix = f"{repo_id}:"
    # Snapshot the matching keys first: the dict must not change size
    # while it is being iterated.
    stale_keys = [key for key in _tree_cache if key.startswith(prefix)]
    for key in stale_keys:
        _tree_cache.pop(key, None)
# ═══════════════════════════════════════════════════
# Build FULL repo context with file contents + cache
# Smart repo context builder
# ═══════════════════════════════════════════════════
async def _build_repo_context(self, db, chat) -> Optional[str]:
"""Load full repository file contents for project-aware conversations."""
async def _build_repo_context(
self, db, chat, user_query: str
) -> Optional[str]:
"""
Build repo context using smart file selection.
For ANY size codebase:
1. Full file tree (paths only) — always included
2. Priority files (configs, entry points) — always loaded
3. Query-relevant files — loaded based on what user asked
"""
if not chat.linked_repo_id:
return None
......@@ -101,64 +133,114 @@ class GenerationManager:
if not settings or not settings.is_active or not settings.gitlab_url or not settings.gitlab_token:
return None
# Check cache
cache_key = f"{repo.id}:{repo.default_branch}"
now = time.time()
if cache_key in _repo_cache:
ts, ctx = _repo_cache[cache_key]
if now - ts < REPO_CACHE_TTL:
return ctx
gl_url = settings.gitlab_url
gl_token = settings.gitlab_token
branch = repo.default_branch
try:
# Load full file contents (up to 80 files, 300K chars)
result = await gitlab_service.load_project_files(
settings.gitlab_url, settings.gitlab_token,
repo.gitlab_project_id, ref=repo.default_branch,
# 1. Get tree (cached)
tree = _get_tree_cache(repo.id, branch)
if tree is None:
tree = await gitlab_service.get_tree(
gl_url, gl_token, repo.gitlab_project_id,
ref=branch, recursive=True,
)
_set_tree_cache(repo.id, branch, tree)
# 2. Get previously discussed files for this chat
prev_files = _chat_file_history.get(chat.id, set())
# 3. Smart-load files
result = await gitlab_service.load_smart_files(
gl_url, gl_token, repo.gitlab_project_id,
ref=branch, tree=tree,
user_query=user_query,
previous_files=prev_files,
)
context = self._format_full_repo_context(result, repo)
_repo_cache[cache_key] = (now, context)
return context
# 4. Track loaded files for future messages
loaded_paths = set()
for f in result["priority_files"]:
loaded_paths.add(f["path"])
for f in result["query_files"]:
loaded_paths.add(f["path"])
if chat.id not in _chat_file_history:
_chat_file_history[chat.id] = set()
_chat_file_history[chat.id].update(loaded_paths)
# 5. Format the context
return self._format_smart_context(result, tree, repo)
except Exception as e:
# Fallback: try just the tree
# Fallback: just the tree
try:
tree = await gitlab_service.get_tree(
settings.gitlab_url, settings.gitlab_token,
repo.gitlab_project_id, ref=repo.default_branch,
gl_url, gl_token, repo.gitlab_project_id, ref=branch,
)
return gitlab_service.format_tree_for_prompt(tree, repo.name, repo.default_branch)
return gitlab_service.format_tree_for_prompt(tree, repo.name, branch)
except Exception:
return f"[Repository: {repo.name} — could not load: {str(e)[:100]}]"
return f"[Repository: {repo.name} — error: {str(e)[:200]}]"
def _format_smart_context(
self, result: dict, tree: list[dict], repo
) -> str:
"""Format loaded files into prompt context."""
# File tree
files_in_tree = sorted(
[i["path"] for i in tree if i["type"] == "blob"]
)
dirs_in_tree = sorted(
[i["path"] for i in tree if i["type"] == "tree"]
)
def _format_full_repo_context(self, result: dict, repo) -> str:
"""Format loaded project files into a prompt-friendly string."""
lines = [
f"Repository: {repo.name}",
f"Branch: {repo.default_branch}",
f"Path: {repo.path_with_namespace}",
f"Total files in tree: {result.get('total_files_in_tree', '?')}",
f"Files loaded: {result.get('files_loaded', '?')}",
f"Total characters: {result.get('total_characters', '?')}",
f"Total files: {len(files_in_tree)} | Directories: {len(dirs_in_tree)}",
f"Files loaded into context: {result['files_loaded']}",
f"Characters loaded: {result['total_characters']:,}",
"",
"FILE CONTENTS:",
"=" * 60,
"═" * 60,
"COMPLETE FILE TREE (all paths):",
"═" * 60,
]
for f in result.get("files", []):
path = f.get("path", "?")
content = f.get("content", "")
lines.append(f"\n━━━ {path} ━━━")
lines.append(content)
lines.append(f"━━━ end {path} ━━━")
for fp in files_in_tree:
lines.append(f" {fp}")
lines.append("")
lines.append("═" * 60)
lines.append("LOADED FILE CONTENTS:")
lines.append("═" * 60)
# Priority files
if result["priority_files"]:
lines.append("")
lines.append("── Config & Entry Point Files ──")
for f in result["priority_files"]:
lines.append(f"\n━━━ {f['path']} ━━━")
lines.append(f["content"])
lines.append(f"━━━ end {f['path']} ━━━")
# Query-relevant files
if result["query_files"]:
lines.append("")
lines.append("── Files Relevant to Current Question ──")
for f in result["query_files"]:
lines.append(f"\n━━━ {f['path']} ━━━")
lines.append(f["content"])
lines.append(f"━━━ end {f['path']} ━━━")
# Note about unloaded files
unloaded = len(files_in_tree) - result["files_loaded"]
if unloaded > 0:
lines.append("")
lines.append(f"NOTE: {unloaded} additional files exist in the repository.")
lines.append("If you need to see a specific file, ask the user to mention it by name.")
lines.append("You can see ALL file paths in the tree above.")
return "\n".join(lines)
def invalidate_repo_cache(self, repo_id: str):
"""Call when a commit is made to refresh the cache."""
keys_to_remove = [k for k in _repo_cache if k.startswith(f"{repo_id}:")]
for k in keys_to_remove:
_repo_cache.pop(k, None)
# ═══════════════════════════════════════════════════
# Main generation loop
# ═══════════════════════════════════════════════════
......@@ -184,7 +266,7 @@ class GenerationManager:
db_user = db.query(User).filter(User.id == user_id).first()
# Quota check + reset
# Quota reset
now = datetime.utcnow()
if db_user.quota_reset_date and now >= db_user.quota_reset_date:
db_user.tokens_used_this_month = 0
......@@ -198,7 +280,7 @@ class GenerationManager:
state.events.append({"type": "error", "message": "Monthly token quota exceeded."})
return
# Process current message attachments
# Process attachments
attachments = []
if attachment_ids:
attachments = (
......@@ -223,7 +305,7 @@ class GenerationManager:
if attachments:
db.commit()
# RAG context
# RAG
kb_id = knowledge_base_id or chat.knowledge_base_id
rag_context = None
if kb_id:
......@@ -232,13 +314,13 @@ class GenerationManager:
except Exception:
pass
# ── FULL REPO CONTEXT (loads all file contents) ──
repo_context = await self._build_repo_context(db, chat)
# ── SMART REPO CONTEXT (query-aware file loading) ──
repo_context = await self._build_repo_context(db, chat, content)
# ── PERSISTENT ATTACHMENT CONTEXT (all files in chat) ──
# ── PERSISTENT ATTACHMENT CONTEXT ──
attachment_context = memory_service.gather_attachment_context(chat_id, db)
# Build system prompt with ALL context
# Build system prompt
system_prompt = build_full_prompt(
rag_context=rag_context,
repo_context=repo_context,
......@@ -248,13 +330,13 @@ class GenerationManager:
# Build conversation messages
messages = memory_service.build_messages(chat, db)
# Inject current message's multimodal content blocks
# Inject multimodal content blocks for current attachments
if attachments and messages and messages[-1]["role"] == "user":
content_blocks = attachment_service.build_claude_content_blocks(attachments)
content_blocks.append({"type": "text", "text": content})
messages[-1]["content"] = content_blocks
# Thinking / reasoning config
# Thinking config
effective_max = max_tokens
thinking_config = None
if reasoning_budget > 0:
......@@ -322,7 +404,7 @@ class GenerationManager:
state.message_id = assistant_msg.id
# Auto-generate title for first message
# Auto-title
msg_count = db.query(Message).filter(Message.chat_id == chat_id).count()
if msg_count <= 2 and chat.title == "New Chat":
try:
......
"""
GitLab CE REST API v4 client.
Uses httpx (already in requirements) for all HTTP calls.
All functions are async so they work cleanly in FastAPI async endpoints.
GitLab CE REST API v4 client — Enhanced for massive codebases.
Smart file selection for repos up to 10M+ tokens.
"""
import base64
import json
import re
from typing import Optional
from urllib.parse import quote
......@@ -20,15 +20,10 @@ class GitLabError(Exception):
def _timeout():
return httpx.Timeout(connect=15.0, read=60.0, write=30.0, pool=30.0)
return httpx.Timeout(connect=15.0, read=120.0, write=30.0, pool=30.0)
async def _request(
method: str,
url: str,
token: str,
**kwargs,
) -> dict | list:
async def _request(method: str, url: str, token: str, **kwargs) -> dict | list:
async with httpx.AsyncClient(timeout=_timeout()) as client:
headers = {"Private-Token": token}
resp = await client.request(method, url, headers=headers, **kwargs)
......@@ -94,19 +89,15 @@ async def create_project(
) -> dict:
url = _api(gitlab_url, "/projects")
body = {
"name": name,
"description": description,
"visibility": visibility,
"initialize_with_readme": initialize_with_readme,
"name": name, "description": description,
"visibility": visibility, "initialize_with_readme": initialize_with_readme,
}
data = await _request("POST", url, token, json=body)
return {
"id": data["id"],
"name": data["name"],
"id": data["id"], "name": data["name"],
"path_with_namespace": data["path_with_namespace"],
"default_branch": data.get("default_branch", "main"),
"web_url": data.get("web_url", ""),
"description": data.get("description") or "",
"web_url": data.get("web_url", ""), "description": data.get("description") or "",
}
......@@ -114,20 +105,17 @@ async def get_project(gitlab_url: str, token: str, project_id: int) -> dict:
url = _api(gitlab_url, f"/projects/{project_id}")
data = await _request("GET", url, token)
return {
"id": data["id"],
"name": data["name"],
"id": data["id"], "name": data["name"],
"path_with_namespace": data["path_with_namespace"],
"default_branch": data.get("default_branch", "main"),
"web_url": data.get("web_url", ""),
"description": data.get("description") or "",
"web_url": data.get("web_url", ""), "description": data.get("description") or "",
"last_activity_at": data.get("last_activity_at", ""),
"forks_count": data.get("forks_count", 0),
"star_count": data.get("star_count", 0),
"forks_count": data.get("forks_count", 0), "star_count": data.get("star_count", 0),
}
# ═══════════════════════════════════════════════════
# Repository Tree
# Repository Tree — loads ALL paths
# ═══════════════════════════════════════════════════
async def get_tree(
......@@ -138,7 +126,10 @@ async def get_tree(
all_items = []
page = 1
while True:
params = {"ref": ref, "per_page": 100, "page": page, "recursive": str(recursive).lower()}
params = {
"ref": ref, "per_page": 100, "page": page,
"recursive": str(recursive).lower(),
}
if path:
params["path"] = path
url = _api(gitlab_url, f"/projects/{project_id}/repository/tree")
......@@ -152,7 +143,7 @@ async def get_tree(
if len(data) < 100:
break
page += 1
if page > 20:
if page > 500: # Up to 50K files
break
return [
{"name": i["name"], "path": i["path"], "type": i["type"], "mode": i.get("mode", "")}
......@@ -164,18 +155,22 @@ def format_tree_for_prompt(items: list[dict], repo_name: str, branch: str) -> st
if not items:
return f"[Repository: {repo_name} ({branch}) — empty or inaccessible]"
dirs = set()
files = []
dirs = set()
for item in sorted(items, key=lambda x: x["path"]):
if item["type"] == "tree":
dirs.add(item["path"])
else:
files.append(item["path"])
lines = [f"Repository: {repo_name} (branch: {branch})", f"Total files: {len(files)}", ""]
lines = [
f"Repository: {repo_name} (branch: {branch})",
f"Total files: {len(files)}",
f"Total directories: {len(dirs)}",
"",
]
for f in files:
lines.append(f" {f}")
return "\n".join(lines)
......@@ -233,25 +228,14 @@ async def get_file_raw(
async def commit_files(
gitlab_url: str, token: str,
project_id: int, branch: str,
commit_message: str,
actions: list[dict],
commit_message: str, actions: list[dict],
) -> dict:
"""
Atomic multi-file commit.
Each action: {"action": "create"|"update"|"delete", "file_path": "...", "content": "..."}
"""
url = _api(gitlab_url, f"/projects/{project_id}/repository/commits")
body = {
"branch": branch,
"commit_message": commit_message,
"actions": actions,
}
body = {"branch": branch, "commit_message": commit_message, "actions": actions}
data = await _request("POST", url, token, json=body)
return {
"id": data.get("id", ""),
"short_id": data.get("short_id", ""),
"message": data.get("message", ""),
"web_url": data.get("web_url", ""),
"id": data.get("id", ""), "short_id": data.get("short_id", ""),
"message": data.get("message", ""), "web_url": data.get("web_url", ""),
}
......@@ -259,8 +243,7 @@ async def commit_single_file(
gitlab_url: str, token: str,
project_id: int, branch: str,
file_path: str, content: str,
commit_message: str,
action: str = "update",
commit_message: str, action: str = "update",
) -> dict:
actions = [{"action": action, "file_path": file_path, "content": content}]
return await commit_files(gitlab_url, token, project_id, branch, commit_message, actions)
......@@ -270,17 +253,13 @@ async def commit_single_file(
# Branches
# ═══════════════════════════════════════════════════
async def list_branches(
gitlab_url: str, token: str,
project_id: int,
) -> list[dict]:
async def list_branches(gitlab_url: str, token: str, project_id: int) -> list[dict]:
url = _api(gitlab_url, f"/projects/{project_id}/repository/branches")
params = {"per_page": 100}
data = await _request("GET", url, token, params=params)
return [
{
"name": b["name"],
"default": b.get("default", False),
"name": b["name"], "default": b.get("default", False),
"web_url": b.get("web_url", ""),
"commit_short_id": b.get("commit", {}).get("short_id", ""),
"commit_message": (b.get("commit", {}).get("message") or "")[:100],
......@@ -291,8 +270,7 @@ async def list_branches(
async def create_branch(
gitlab_url: str, token: str,
project_id: int,
branch_name: str, ref: str = "main",
project_id: int, branch_name: str, ref: str = "main",
) -> dict:
url = _api(gitlab_url, f"/projects/{project_id}/repository/branches")
body = {"branch": branch_name, "ref": ref}
......@@ -306,30 +284,23 @@ async def create_branch(
async def create_merge_request(
gitlab_url: str, token: str,
project_id: int,
source_branch: str,
target_branch: str,
title: str,
description: str = "",
project_id: int, source_branch: str, target_branch: str,
title: str, description: str = "",
) -> dict:
url = _api(gitlab_url, f"/projects/{project_id}/merge_requests")
body = {
"source_branch": source_branch,
"target_branch": target_branch,
"title": title,
"description": description,
"source_branch": source_branch, "target_branch": target_branch,
"title": title, "description": description,
}
data = await _request("POST", url, token, json=body)
return {
"iid": data.get("iid"),
"title": data.get("title", ""),
"web_url": data.get("web_url", ""),
"state": data.get("state", ""),
"iid": data.get("iid"), "title": data.get("title", ""),
"web_url": data.get("web_url", ""), "state": data.get("state", ""),
}
# ═══════════════════════════════════════════════════
# Bulk Load for Analysis
# SMART FILE SELECTION — the core of large repo support
# ═══════════════════════════════════════════════════
TEXT_EXTENSIONS = {
......@@ -338,10 +309,288 @@ TEXT_EXTENSIONS = {
".gd", ".html", ".css", ".scss", ".json", ".yaml", ".yml", ".xml",
".toml", ".ini", ".cfg", ".sh", ".bash", ".sql", ".md", ".txt",
".env", ".dockerfile", ".vue", ".svelte", ".dart", ".r", ".csv",
".graphql", ".proto", ".tf", ".hcl", ".gradle", ".cmake",
".makefile", ".rake", ".gemspec", ".lock", ".mod", ".sum",
".csproj", ".sln", ".props", ".targets", ".fsproj",
}
MAX_ANALYSIS_FILES = 80
MAX_ANALYSIS_CHARS = 300_000
# Regular expressions for files that are ALWAYS loaded if they exist
# (project understanding). Each pattern is applied with re.search and
# re.IGNORECASE against the repository-relative path, so a leading "^"
# anchors the pattern to the repository root.
PRIORITY_PATTERNS = [
    # Package manifests & configs
    r"^package\.json$", r"^package-lock\.json$", r"^yarn\.lock$",
    r"^requirements\.txt$", r"^setup\.py$", r"^setup\.cfg$", r"^pyproject\.toml$",
    r"^Pipfile$", r"^Cargo\.toml$", r"^go\.mod$", r"^go\.sum$",
    r"^Gemfile$", r"^pom\.xml$", r"^build\.gradle$",
    # "^\.csproj$" could only match a file literally named ".csproj";
    # match any project file by suffix, like the .sln pattern beside it.
    r".*\.csproj$", r".*\.sln$",
    # Docker & CI
    r"^Dockerfile", r"^docker-compose", r"^\.dockerignore$",
    r"^\.gitlab-ci\.yml$", r"^\.github/", r"^Jenkinsfile$",
    # Config files
    r"^tsconfig.*\.json$", r"^vite\.config", r"^webpack\.config",
    r"^tailwind\.config", r"^postcss\.config", r"^babel\.config",
    r"^\.eslintrc", r"^\.prettierrc", r"^\.editorconfig$",
    r"^next\.config", r"^nuxt\.config", r"^angular\.json$",
    r"^\.env\.example$", r"^\.env\.sample$",
    # Docs
    r"^README", r"^CHANGELOG", r"^CONTRIBUTING",
    r"^docs/.*\.md$",
    # Entry points
    r"^main\.", r"^index\.", r"^app\.", r"^server\.",
    r"^Program\.", r"^Startup\.",
    r"^src/main\.", r"^src/index\.", r"^src/app\.", r"^src/App\.",
    r"^backend/main\.", r"^frontend/src/main\.", r"^frontend/src/App\.",
    r"^cmd/", r"^internal/",
    # Database
    r".*migrations/.*", r".*models\.", r".*schema\.",
]
# Character / file budgets for the file contents placed in the prompt.
# NOTE(review): MAX_SMART_CHARS is not referenced by the loader visible in
# this file; it equals MAX_PRIORITY_CHARS + MAX_QUERY_CHARS.
MAX_SMART_CHARS = 600_000 # ~150K tokens
# Budget for priority files. It is checked before each fetch, so the running
# total can overshoot by up to one (truncated) file.
MAX_PRIORITY_CHARS = 150_000 # Reserve for priority files
# Budget for query-relevant files, checked the same way.
MAX_QUERY_CHARS = 450_000 # For query-relevant files
MAX_FILES_TO_LOAD = 300 # Hard cap on number of files
# Longer files are truncated to this many characters (not skipped) and a
# "[truncated ...]" marker is appended.
MAX_SINGLE_FILE = 50_000 # Per-file character cap
# Lower-cased basenames treated as text even though they carry no extension
# listed in TEXT_EXTENSIONS.
_TEXT_FILENAMES = frozenset({
    "dockerfile", "makefile", "gemfile", "rakefile", "procfile",
    "vagrantfile", "jenkinsfile", "brewfile", ".gitignore",
    ".dockerignore", ".env.example", ".env.sample",
    # Targeted by PRIORITY_PATTERNS; without these entries the text filter
    # rejects them and they can never be loaded.
    "pipfile", "readme", "changelog", "contributing",
    ".editorconfig", ".eslintrc", ".prettierrc",
})


def _is_text_file(path: str) -> bool:
    """Return True if ``path`` looks like a text file that can be loaded.

    The check is purely name-based and case-insensitive:
      1. the basename is a well-known text filename (see _TEXT_FILENAMES);
      2. the basename is a Dockerfile variant such as ``Dockerfile.prod``;
      3. the final extension is listed in TEXT_EXTENSIONS.
    Anything else, including extensionless names not covered above, is
    treated as non-text.
    """
    name = path.lower().rsplit("/", 1)[-1]

    if name in _TEXT_FILENAMES:
        return True

    # "Dockerfile.dev" would otherwise be judged by its ".dev" suffix.
    if name.startswith("dockerfile."):
        return True

    # Only the last dot-separated component counts as the extension.
    _, dot, suffix = name.rpartition(".")
    return bool(dot) and f".{suffix}" in TEXT_EXTENSIONS
def _is_priority_file(path: str) -> bool:
    """Return True when ``path`` matches any entry of PRIORITY_PATTERNS.

    Patterns are applied with ``re.search`` and ``re.IGNORECASE``; files
    that match are the ones the loader always tries to include.
    """
    return any(
        re.search(regex, path, re.IGNORECASE) is not None
        for regex in PRIORITY_PATTERNS
    )
# Query keyword -> file extensions (or bare lower-cased filenames) that the
# keyword hints at. Built once instead of on every scored path.
_TECH_EXT_MAP = {
    "react": {".jsx", ".tsx", ".js"},
    "vue": {".vue"},
    "svelte": {".svelte"},
    "python": {".py"},
    "django": {".py"},
    "flask": {".py"},
    "fastapi": {".py"},
    "typescript": {".ts", ".tsx"},
    "javascript": {".js", ".jsx"},
    "rust": {".rs"},
    "go": {".go"},
    "java": {".java"},
    "csharp": {".cs"},
    "c#": {".cs"},
    "unity": {".cs"},
    "ruby": {".rb"},
    "php": {".php"},
    "swift": {".swift"},
    "kotlin": {".kt"},
    "dart": {".dart"},
    "sql": {".sql"},
    "docker": {"dockerfile", ".yml", ".yaml"},
    "css": {".css", ".scss"},
    "html": {".html"},
    "api": {".py", ".js", ".ts", ".go", ".rs"},
    "route": {".py", ".js", ".ts", ".go"},
    "model": {".py", ".cs", ".java", ".ts"},
    "test": {".py", ".js", ".ts", ".java", ".cs"},
    "config": {".json", ".yaml", ".yml", ".toml", ".ini"},
    "database": {".sql", ".py"},
    "migration": {".sql", ".py"},
    "component": {".jsx", ".tsx", ".vue", ".svelte"},
    "style": {".css", ".scss"},
}

# Directory names that earn a small fixed bonus wherever they occur in a path.
_IMPORTANT_DIRS = frozenset({
    "src", "lib", "app", "api", "routes", "models", "services",
    "components", "pages", "views", "controllers", "utils", "helpers",
    "config", "tests", "test",
})

# Turns every path punctuation character into a space so a path splits
# into its component words.
_PATH_WORD_BREAKS = str.maketrans("/._-", "    ")


def _score_file_for_query(path: str, query: str, previous_files: set[str]) -> float:
    """Score how relevant a repository path is to the user's message.

    The score is a sum of heuristic bonuses:
      +200  the full path appears in the query
      +100  the filename, or the filename without its extension, appears
       +40  per directory component (longer than 2 chars) found in the query
       +15  per path word (longer than 2 chars) equal to a query word
       +20  per technology keyword in the query whose extension set
            contains this file's extension (or its bare filename)
       +30  the path is in ``previous_files``
        +5  per directory component that is a conventional code directory

    Args:
        path: Repository-relative file path.
        query: The user's message; an empty query scores every file 0.0.
        previous_files: Paths already loaded earlier in the same chat.

    Returns:
        The accumulated score; higher means more relevant.
    """
    if not query:
        return 0.0

    score = 0.0
    lower_path = path.lower()
    lower_query = query.lower()

    name = lower_path.rsplit("/", 1)[-1]
    stem = name.rsplit(".", 1)[0] if "." in name else name
    dir_parts = lower_path.split("/")[:-1]
    path_words = set(lower_path.translate(_PATH_WORD_BREAKS).split())

    # Exact path mentioned in query
    if lower_path in lower_query or path in query:
        score += 200

    # Filename mentioned. The stem needs care: for dotfiles such as
    # ".gitignore" it is "", and "" is a substring of every query, which
    # used to hand +100 to every dotfile. Stems of one or two characters
    # ("ui", "db") must appear as a whole word, because as substrings they
    # match unrelated words ("build", "sandbox").
    if len(stem) > 2:
        stem_mentioned = stem in lower_query
    elif stem:
        stem_mentioned = stem in set(re.findall(r"[a-z0-9_]+", lower_query))
    else:
        stem_mentioned = False
    if (name and name in lower_query) or stem_mentioned:
        score += 100

    # Directory mentioned
    for directory in dir_parts:
        if len(directory) > 2 and directory in lower_query:
            score += 40

    # Individual path component words in query
    query_words = set(re.findall(r'[a-z]{3,}', lower_query))
    for word in path_words:
        if len(word) > 2 and word in query_words:
            score += 15

    # Technology/framework keywords -> matching extensions.
    # NOTE(review): keywords are matched as substrings of the query, so
    # "go" also fires on "good" and "java" on "javascript".
    ext = "." + name.rsplit(".", 1)[-1] if "." in name else ""
    for keyword, extensions in _TECH_EXT_MAP.items():
        if keyword in lower_query and (ext in extensions or name in extensions):
            score += 20

    # Bonus for files discussed previously in this chat
    if path in previous_files:
        score += 30

    # Bonus for files in common important directories
    for directory in dir_parts:
        if directory in _IMPORTANT_DIRS:
            score += 5

    return score
async def load_smart_files(
gitlab_url: str,
token: str,
project_id: int,
ref: str,
tree: list[dict],
user_query: str = "",
previous_files: set[str] | None = None,
) -> dict:
"""
Smart file loader for massive codebases.
Returns:
{
"tree_summary": str, # Full file tree (paths only)
"priority_files": [...], # Always-loaded config/entry files
"query_files": [...], # Files relevant to user's question
"total_files_in_tree": int,
"files_loaded": int,
"total_characters": int,
"skipped_large": int,
}
"""
if previous_files is None:
previous_files = set()
# Separate text files from binary
text_files = []
for item in tree:
if item["type"] != "blob":
continue
if _is_text_file(item["path"]):
text_files.append(item["path"])
# Split into priority and regular files
priority_paths = []
regular_paths = []
for fp in text_files:
if _is_priority_file(fp):
priority_paths.append(fp)
else:
regular_paths.append(fp)
# Score regular files by relevance to user query
scored = []
for fp in regular_paths:
s = _score_file_for_query(fp, user_query, previous_files)
scored.append((s, fp))
# Sort by score descending
scored.sort(key=lambda x: -x[0])
# Load priority files first
priority_loaded = []
priority_chars = 0
skipped_large = 0
for fp in priority_paths[:100]: # Cap at 100 priority files
if priority_chars >= MAX_PRIORITY_CHARS:
break
try:
raw = await get_file_raw(gitlab_url, token, project_id, fp, ref=ref)
if len(raw) > MAX_SINGLE_FILE:
raw = raw[:MAX_SINGLE_FILE] + f"\n\n... [truncated — file is {len(raw)} chars]"
skipped_large += 1
priority_loaded.append({"path": fp, "content": raw})
priority_chars += len(raw)
except Exception:
priority_loaded.append({"path": fp, "content": "[Could not read file]"})
# Load query-relevant files
query_loaded = []
query_chars = 0
loaded_paths = {f["path"] for f in priority_loaded}
files_loaded_count = len(priority_loaded)
for score, fp in scored:
if files_loaded_count >= MAX_FILES_TO_LOAD:
break
if query_chars >= MAX_QUERY_CHARS:
break
if fp in loaded_paths:
continue
# Only load scored files (score > 0) OR if we have room and few files loaded
if score <= 0 and files_loaded_count > 50:
break
try:
raw = await get_file_raw(gitlab_url, token, project_id, fp, ref=ref)
if len(raw) > MAX_SINGLE_FILE:
raw = raw[:MAX_SINGLE_FILE] + f"\n\n... [truncated — file is {len(raw)} chars]"
skipped_large += 1
query_loaded.append({"path": fp, "content": raw, "score": score})
query_chars += len(raw)
loaded_paths.add(fp)
files_loaded_count += 1
except Exception:
query_loaded.append({"path": fp, "content": "[Could not read file]", "score": score})
files_loaded_count += 1
return {
"priority_files": priority_loaded,
"query_files": query_loaded,
"total_files_in_tree": len(text_files),
"total_binary_files": len(tree) - len(text_files),
"files_loaded": files_loaded_count,
"priority_chars": priority_chars,
"query_chars": query_chars,
"total_characters": priority_chars + query_chars,
"skipped_large": skipped_large,
}
# ═══════════════════════════════════════════════════
# Legacy bulk loader (used by /analyze endpoint)
# ═══════════════════════════════════════════════════
MAX_ANALYSIS_FILES = 200
MAX_ANALYSIS_CHARS = 600_000
async def load_project_files(
......@@ -354,12 +603,14 @@ async def load_project_files(
for item in tree:
if item["type"] != "blob":
continue
ext = ""
name = item["path"].lower()
if "." in name:
ext = "." + name.rsplit(".", 1)[-1]
if ext in TEXT_EXTENSIONS or name in {"dockerfile", "makefile", "gemfile", "rakefile"}:
if _is_text_file(item["path"]):
text_files.append(item["path"])
# Sort: priority files first, then alphabetical
def sort_key(fp):
return (0 if _is_priority_file(fp) else 1, fp)
text_files.sort(key=sort_key)
text_files = text_files[:MAX_ANALYSIS_FILES]
contents = []
......@@ -369,6 +620,8 @@ async def load_project_files(
break
try:
raw = await get_file_raw(gitlab_url, token, project_id, fp, ref=ref)
if len(raw) > MAX_SINGLE_FILE:
raw = raw[:MAX_SINGLE_FILE] + "\n... [truncated]"
if total_chars + len(raw) > MAX_ANALYSIS_CHARS:
raw = raw[:MAX_ANALYSIS_CHARS - total_chars] + "\n... [truncated]"
contents.append({"path": fp, "content": raw})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment