Spaces:
Running
Running
Commit
·
42f5b98
1
Parent(s):
1e19152
Update to v0.1.2
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +0 -7
- coderag/__init__.py +3 -0
- coderag/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/__pycache__/cli.cpython-313.pyc +0 -0
- coderag/__pycache__/config.cpython-313.pyc +0 -0
- coderag/__pycache__/logging.cpython-313.pyc +0 -0
- coderag/__pycache__/main.cpython-313.pyc +0 -0
- coderag/api/__init__.py +5 -0
- coderag/api/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/api/__pycache__/routes.cpython-313.pyc +0 -0
- coderag/api/__pycache__/schemas.cpython-313.pyc +0 -0
- coderag/api/routes.py +310 -0
- coderag/api/schemas.py +101 -0
- coderag/cli.py +675 -0
- coderag/config.py +154 -0
- coderag/generation/__init__.py +7 -0
- coderag/generation/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/generation/__pycache__/citations.cpython-313.pyc +0 -0
- coderag/generation/__pycache__/generator.cpython-313.pyc +0 -0
- coderag/generation/__pycache__/prompts.cpython-313.pyc +0 -0
- coderag/generation/citations.py +77 -0
- coderag/generation/generator.py +241 -0
- coderag/generation/prompts.py +72 -0
- coderag/indexing/__init__.py +6 -0
- coderag/indexing/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/indexing/__pycache__/embeddings.cpython-313.pyc +0 -0
- coderag/indexing/__pycache__/vectorstore.cpython-313.pyc +0 -0
- coderag/indexing/embeddings.py +147 -0
- coderag/indexing/vectorstore.py +170 -0
- coderag/ingestion/__init__.py +8 -0
- coderag/ingestion/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/chunker.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/filter.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/loader.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/validator.cpython-313.pyc +0 -0
- coderag/ingestion/chunker.py +184 -0
- coderag/ingestion/filter.py +85 -0
- coderag/ingestion/loader.py +117 -0
- coderag/ingestion/validator.py +98 -0
- coderag/logging.py +111 -0
- coderag/main.py +128 -0
- coderag/mcp/__init__.py +11 -0
- coderag/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/cli.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/handlers.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/prompts.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/resources.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/server.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/tools.cpython-313.pyc +0 -0
- coderag/mcp/cli.py +37 -0
app.py
CHANGED
|
@@ -5,13 +5,6 @@ It's configured to work without GPU (embeddings on CPU, LLM via Groq).
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
-
import sys
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
# Add src to path for HF Spaces (no pip install -e . available)
|
| 12 |
-
src_path = Path(__file__).parent / "src"
|
| 13 |
-
if src_path.exists():
|
| 14 |
-
sys.path.insert(0, str(src_path))
|
| 15 |
|
| 16 |
# Configure for HF Spaces environment
|
| 17 |
os.environ.setdefault("MODEL_LLM_PROVIDER", "groq")
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Configure for HF Spaces environment
|
| 10 |
os.environ.setdefault("MODEL_LLM_PROVIDER", "groq")
|
coderag/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CodeRAG: RAG-based Q&A system for code repositories with verifiable citations."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
coderag/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (279 Bytes). View file
|
|
|
coderag/__pycache__/cli.cpython-313.pyc
ADDED
|
Binary file (33.6 kB). View file
|
|
|
coderag/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (6.4 kB). View file
|
|
|
coderag/__pycache__/logging.cpython-313.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
coderag/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (5.61 kB). View file
|
|
|
coderag/api/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API module: REST endpoints for programmatic access."""
|
| 2 |
+
|
| 3 |
+
from coderag.api.routes import router
|
| 4 |
+
|
| 5 |
+
__all__ = ["router"]
|
coderag/api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (309 Bytes). View file
|
|
|
coderag/api/__pycache__/routes.cpython-313.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
coderag/api/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (5.38 kB). View file
|
|
|
coderag/api/routes.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""REST API routes."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
| 8 |
+
from fastapi.responses import JSONResponse
|
| 9 |
+
|
| 10 |
+
from coderag.api.schemas import (
|
| 11 |
+
IndexRepositoryRequest,
|
| 12 |
+
IndexRepositoryResponse,
|
| 13 |
+
QueryRequest,
|
| 14 |
+
QueryResponse,
|
| 15 |
+
ListRepositoriesResponse,
|
| 16 |
+
RepositoryInfo,
|
| 17 |
+
CitationResponse,
|
| 18 |
+
RetrievedChunkResponse,
|
| 19 |
+
ErrorResponse,
|
| 20 |
+
)
|
| 21 |
+
from coderag.config import get_settings
|
| 22 |
+
from coderag.generation.generator import ResponseGenerator
|
| 23 |
+
from coderag.indexing.embeddings import EmbeddingGenerator
|
| 24 |
+
from coderag.indexing.vectorstore import VectorStore
|
| 25 |
+
from coderag.ingestion.chunker import CodeChunker
|
| 26 |
+
from coderag.ingestion.filter import FileFilter
|
| 27 |
+
from coderag.ingestion.loader import RepositoryLoader
|
| 28 |
+
from coderag.ingestion.validator import GitHubURLValidator, ValidationError
|
| 29 |
+
from coderag.logging import get_logger
|
| 30 |
+
from coderag.models.document import Document
|
| 31 |
+
from coderag.models.query import Query as QueryModel
|
| 32 |
+
from coderag.models.repository import Repository, RepositoryStatus
|
| 33 |
+
|
| 34 |
+
logger = get_logger(__name__)
|
| 35 |
+
router = APIRouter()
|
| 36 |
+
|
| 37 |
+
# Global state (in production, use a proper database)
|
| 38 |
+
settings = get_settings()
|
| 39 |
+
repos_file = settings.data_dir / "repositories.json"
|
| 40 |
+
repositories: dict[str, Repository] = {}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_repositories() -> None:
|
| 44 |
+
"""Load repositories from disk."""
|
| 45 |
+
global repositories
|
| 46 |
+
if repos_file.exists():
|
| 47 |
+
try:
|
| 48 |
+
data = json.loads(repos_file.read_text())
|
| 49 |
+
repositories = {r["id"]: Repository.from_dict(r) for r in data}
|
| 50 |
+
except Exception as e:
|
| 51 |
+
logger.error("Failed to load repositories", error=str(e))
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def save_repositories() -> None:
|
| 55 |
+
"""Save repositories to disk."""
|
| 56 |
+
repos_file.parent.mkdir(parents=True, exist_ok=True)
|
| 57 |
+
data = [r.to_dict() for r in repositories.values()]
|
| 58 |
+
repos_file.write_text(json.dumps(data, indent=2))
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Load on startup
|
| 62 |
+
load_repositories()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def resolve_repo_id(partial_id: str) -> Optional[str]:
|
| 66 |
+
"""Resolve a partial repository ID to a full ID.
|
| 67 |
+
|
| 68 |
+
Supports both full UUIDs and partial IDs (first 8+ characters).
|
| 69 |
+
Returns None if no match or multiple matches found.
|
| 70 |
+
"""
|
| 71 |
+
# First try exact match
|
| 72 |
+
if partial_id in repositories:
|
| 73 |
+
return partial_id
|
| 74 |
+
|
| 75 |
+
# Try prefix match (minimum 8 characters recommended)
|
| 76 |
+
matches = [rid for rid in repositories.keys() if rid.startswith(partial_id)]
|
| 77 |
+
|
| 78 |
+
if len(matches) == 1:
|
| 79 |
+
return matches[0]
|
| 80 |
+
|
| 81 |
+
return None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_repo_or_404(repo_id: str) -> Repository:
|
| 85 |
+
"""Get a repository by ID (full or partial), raising 404 if not found."""
|
| 86 |
+
full_id = resolve_repo_id(repo_id)
|
| 87 |
+
if full_id is None:
|
| 88 |
+
raise HTTPException(status_code=404, detail="Repository not found")
|
| 89 |
+
return repositories[full_id]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def index_repository_task(
|
| 93 |
+
url: str,
|
| 94 |
+
repo_id: str,
|
| 95 |
+
branch: Optional[str],
|
| 96 |
+
include_patterns: Optional[list[str]],
|
| 97 |
+
exclude_patterns: Optional[list[str]],
|
| 98 |
+
) -> None:
|
| 99 |
+
"""Background task to index a repository."""
|
| 100 |
+
repo = repositories[repo_id]
|
| 101 |
+
|
| 102 |
+
try:
|
| 103 |
+
# Validate and clone
|
| 104 |
+
validator = GitHubURLValidator()
|
| 105 |
+
repo_info = await validator.validate_repository(url)
|
| 106 |
+
branch = branch or repo_info.branch or "main"
|
| 107 |
+
|
| 108 |
+
loader = RepositoryLoader()
|
| 109 |
+
repo_path = loader.clone_repository(repo_info, branch)
|
| 110 |
+
|
| 111 |
+
repo.clone_path = repo_path
|
| 112 |
+
repo.status = RepositoryStatus.INDEXING
|
| 113 |
+
save_repositories()
|
| 114 |
+
|
| 115 |
+
# Filter files
|
| 116 |
+
file_filter = FileFilter(
|
| 117 |
+
include_patterns=include_patterns,
|
| 118 |
+
exclude_patterns=exclude_patterns,
|
| 119 |
+
)
|
| 120 |
+
files = list(file_filter.filter_files(repo_path))
|
| 121 |
+
|
| 122 |
+
# Load documents
|
| 123 |
+
documents = []
|
| 124 |
+
for file_path in files:
|
| 125 |
+
try:
|
| 126 |
+
doc = Document.from_file(file_path, repo_path, repo.id)
|
| 127 |
+
documents.append(doc)
|
| 128 |
+
except Exception as e:
|
| 129 |
+
logger.warning("Failed to load file", path=str(file_path), error=str(e))
|
| 130 |
+
|
| 131 |
+
# Chunk
|
| 132 |
+
chunker = CodeChunker()
|
| 133 |
+
chunks = []
|
| 134 |
+
for doc in documents:
|
| 135 |
+
for chunk in chunker.chunk_document(doc):
|
| 136 |
+
chunks.append(chunk)
|
| 137 |
+
|
| 138 |
+
# Embed and store
|
| 139 |
+
if chunks:
|
| 140 |
+
vectorstore = VectorStore()
|
| 141 |
+
vectorstore.delete_repo_chunks(repo.id)
|
| 142 |
+
|
| 143 |
+
embedder = EmbeddingGenerator()
|
| 144 |
+
embedded_chunks = embedder.embed_chunks(chunks)
|
| 145 |
+
vectorstore.add_chunks(embedded_chunks)
|
| 146 |
+
|
| 147 |
+
# Update status
|
| 148 |
+
repo.chunk_count = len(chunks)
|
| 149 |
+
repo.indexed_at = datetime.now()
|
| 150 |
+
repo.status = RepositoryStatus.READY
|
| 151 |
+
save_repositories()
|
| 152 |
+
|
| 153 |
+
logger.info("Repository indexed", repo_id=repo_id, chunks=len(chunks))
|
| 154 |
+
|
| 155 |
+
except Exception as e:
|
| 156 |
+
logger.error("Indexing failed", repo_id=repo_id, error=str(e))
|
| 157 |
+
repo.status = RepositoryStatus.ERROR
|
| 158 |
+
repo.error_message = str(e)
|
| 159 |
+
save_repositories()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
@router.post("/repos/index", response_model=IndexRepositoryResponse, status_code=202)
|
| 163 |
+
async def index_repository(
|
| 164 |
+
request: IndexRepositoryRequest,
|
| 165 |
+
background_tasks: BackgroundTasks,
|
| 166 |
+
) -> IndexRepositoryResponse:
|
| 167 |
+
"""Index a GitHub repository."""
|
| 168 |
+
# Create repository record
|
| 169 |
+
repo = Repository(
|
| 170 |
+
url=request.url,
|
| 171 |
+
branch=request.branch or "main",
|
| 172 |
+
status=RepositoryStatus.PENDING,
|
| 173 |
+
)
|
| 174 |
+
repositories[repo.id] = repo
|
| 175 |
+
save_repositories()
|
| 176 |
+
|
| 177 |
+
# Start background indexing
|
| 178 |
+
background_tasks.add_task(
|
| 179 |
+
index_repository_task,
|
| 180 |
+
request.url,
|
| 181 |
+
repo.id,
|
| 182 |
+
request.branch,
|
| 183 |
+
request.include_patterns,
|
| 184 |
+
request.exclude_patterns,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
return IndexRepositoryResponse(
|
| 188 |
+
repo_id=repo.id,
|
| 189 |
+
status=repo.status.value,
|
| 190 |
+
message="Repository indexing started",
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
@router.post("/query", response_model=QueryResponse)
|
| 195 |
+
async def query_repository(request: QueryRequest) -> QueryResponse:
|
| 196 |
+
"""Query a repository.
|
| 197 |
+
|
| 198 |
+
Supports both full repository IDs and partial IDs (first 8+ characters).
|
| 199 |
+
"""
|
| 200 |
+
# Check repository exists (supports partial IDs)
|
| 201 |
+
repo = get_repo_or_404(request.repo_id)
|
| 202 |
+
|
| 203 |
+
if repo.status != RepositoryStatus.READY:
|
| 204 |
+
raise HTTPException(
|
| 205 |
+
status_code=400,
|
| 206 |
+
detail=f"Repository not ready (status: {repo.status.value})",
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
try:
|
| 210 |
+
# Generate response (use resolved repo.id for consistency)
|
| 211 |
+
generator = ResponseGenerator()
|
| 212 |
+
query = QueryModel(
|
| 213 |
+
question=request.question,
|
| 214 |
+
repo_id=repo.id, # Use resolved full ID
|
| 215 |
+
top_k=request.top_k,
|
| 216 |
+
)
|
| 217 |
+
response = generator.generate(query)
|
| 218 |
+
|
| 219 |
+
# Convert to API schema
|
| 220 |
+
return QueryResponse(
|
| 221 |
+
answer=response.answer,
|
| 222 |
+
citations=[
|
| 223 |
+
CitationResponse(
|
| 224 |
+
file_path=c.file_path,
|
| 225 |
+
start_line=c.start_line,
|
| 226 |
+
end_line=c.end_line,
|
| 227 |
+
)
|
| 228 |
+
for c in response.citations
|
| 229 |
+
],
|
| 230 |
+
retrieved_chunks=[
|
| 231 |
+
RetrievedChunkResponse(
|
| 232 |
+
chunk_id=c.chunk_id,
|
| 233 |
+
file_path=c.file_path,
|
| 234 |
+
start_line=c.start_line,
|
| 235 |
+
end_line=c.end_line,
|
| 236 |
+
relevance_score=c.relevance_score,
|
| 237 |
+
chunk_type=c.chunk_type,
|
| 238 |
+
name=c.name,
|
| 239 |
+
content=c.content,
|
| 240 |
+
)
|
| 241 |
+
for c in response.retrieved_chunks
|
| 242 |
+
],
|
| 243 |
+
grounded=response.grounded,
|
| 244 |
+
query_id=response.query_id,
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
except Exception as e:
|
| 248 |
+
logger.error("Query failed", error=str(e))
|
| 249 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
@router.get("/repos", response_model=ListRepositoriesResponse)
|
| 253 |
+
async def list_repositories() -> ListRepositoriesResponse:
|
| 254 |
+
"""List all repositories."""
|
| 255 |
+
return ListRepositoriesResponse(
|
| 256 |
+
repositories=[
|
| 257 |
+
RepositoryInfo(
|
| 258 |
+
id=repo.id,
|
| 259 |
+
url=repo.url,
|
| 260 |
+
branch=repo.branch,
|
| 261 |
+
chunk_count=repo.chunk_count,
|
| 262 |
+
status=repo.status.value,
|
| 263 |
+
indexed_at=repo.indexed_at,
|
| 264 |
+
error_message=repo.error_message,
|
| 265 |
+
)
|
| 266 |
+
for repo in repositories.values()
|
| 267 |
+
]
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
@router.get("/repos/{repo_id}", response_model=RepositoryInfo)
|
| 272 |
+
async def get_repository(repo_id: str) -> RepositoryInfo:
|
| 273 |
+
"""Get repository details.
|
| 274 |
+
|
| 275 |
+
Supports both full repository IDs and partial IDs (first 8+ characters).
|
| 276 |
+
"""
|
| 277 |
+
repo = get_repo_or_404(repo_id)
|
| 278 |
+
return RepositoryInfo(
|
| 279 |
+
id=repo.id,
|
| 280 |
+
url=repo.url,
|
| 281 |
+
branch=repo.branch,
|
| 282 |
+
chunk_count=repo.chunk_count,
|
| 283 |
+
status=repo.status.value,
|
| 284 |
+
indexed_at=repo.indexed_at,
|
| 285 |
+
error_message=repo.error_message,
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
@router.delete("/repos/{repo_id}")
|
| 290 |
+
async def delete_repository(repo_id: str) -> dict:
|
| 291 |
+
"""Delete a repository.
|
| 292 |
+
|
| 293 |
+
Supports both full repository IDs and partial IDs (first 8+ characters).
|
| 294 |
+
"""
|
| 295 |
+
repo = get_repo_or_404(repo_id)
|
| 296 |
+
|
| 297 |
+
try:
|
| 298 |
+
# Delete from vector store (use resolved full ID)
|
| 299 |
+
vectorstore = VectorStore()
|
| 300 |
+
vectorstore.delete_repo_chunks(repo.id)
|
| 301 |
+
|
| 302 |
+
# Delete from records (use resolved full ID)
|
| 303 |
+
del repositories[repo.id]
|
| 304 |
+
save_repositories()
|
| 305 |
+
|
| 306 |
+
return {"message": f"Repository {repo.full_name} deleted"}
|
| 307 |
+
|
| 308 |
+
except Exception as e:
|
| 309 |
+
logger.error("Delete failed", error=str(e))
|
| 310 |
+
raise HTTPException(status_code=500, detail=str(e))
|
coderag/api/schemas.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for REST API."""
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class IndexRepositoryRequest(BaseModel):
|
| 10 |
+
"""Request to index a repository."""
|
| 11 |
+
|
| 12 |
+
url: str = Field(..., description="GitHub repository URL")
|
| 13 |
+
branch: Optional[str] = Field(None, description="Branch name (default: main)")
|
| 14 |
+
include_patterns: Optional[list[str]] = Field(None, description="File patterns to include")
|
| 15 |
+
exclude_patterns: Optional[list[str]] = Field(None, description="File patterns to exclude")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class IndexRepositoryResponse(BaseModel):
|
| 19 |
+
"""Response from indexing request."""
|
| 20 |
+
|
| 21 |
+
repo_id: str = Field(..., description="Repository ID")
|
| 22 |
+
status: str = Field(..., description="Indexing status")
|
| 23 |
+
message: str = Field(..., description="Status message")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class QueryRequest(BaseModel):
|
| 27 |
+
"""Request to query a repository."""
|
| 28 |
+
|
| 29 |
+
question: str = Field(..., description="Question about the repository")
|
| 30 |
+
repo_id: str = Field(..., description="Repository ID to query")
|
| 31 |
+
top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class CitationResponse(BaseModel):
|
| 35 |
+
"""Citation information."""
|
| 36 |
+
|
| 37 |
+
file_path: str
|
| 38 |
+
start_line: int
|
| 39 |
+
end_line: int
|
| 40 |
+
|
| 41 |
+
class Config:
|
| 42 |
+
from_attributes = True
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class RetrievedChunkResponse(BaseModel):
|
| 46 |
+
"""Retrieved chunk information."""
|
| 47 |
+
|
| 48 |
+
chunk_id: str
|
| 49 |
+
file_path: str
|
| 50 |
+
start_line: int
|
| 51 |
+
end_line: int
|
| 52 |
+
relevance_score: float
|
| 53 |
+
chunk_type: str
|
| 54 |
+
name: Optional[str] = None
|
| 55 |
+
content: str
|
| 56 |
+
|
| 57 |
+
class Config:
|
| 58 |
+
from_attributes = True
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class QueryResponse(BaseModel):
|
| 62 |
+
"""Response from a query."""
|
| 63 |
+
|
| 64 |
+
answer: str = Field(..., description="Generated answer")
|
| 65 |
+
citations: list[CitationResponse] = Field(..., description="Citations in the answer")
|
| 66 |
+
retrieved_chunks: list[RetrievedChunkResponse] = Field(..., description="Evidence chunks")
|
| 67 |
+
grounded: bool = Field(..., description="Whether response is grounded in evidence")
|
| 68 |
+
query_id: str = Field(..., description="Query ID")
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class RepositoryInfo(BaseModel):
|
| 72 |
+
"""Repository information."""
|
| 73 |
+
|
| 74 |
+
id: str
|
| 75 |
+
url: str
|
| 76 |
+
branch: str
|
| 77 |
+
chunk_count: int
|
| 78 |
+
status: str
|
| 79 |
+
indexed_at: Optional[datetime] = None
|
| 80 |
+
error_message: Optional[str] = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class ListRepositoriesResponse(BaseModel):
|
| 84 |
+
"""List of repositories."""
|
| 85 |
+
|
| 86 |
+
repositories: list[RepositoryInfo]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class HealthResponse(BaseModel):
|
| 90 |
+
"""Health check response."""
|
| 91 |
+
|
| 92 |
+
status: str
|
| 93 |
+
app: str
|
| 94 |
+
version: str
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class ErrorResponse(BaseModel):
|
| 98 |
+
"""Error response."""
|
| 99 |
+
|
| 100 |
+
error: str
|
| 101 |
+
detail: Optional[str] = None
|
coderag/cli.py
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unified CLI for CodeRAG."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import platform
|
| 6 |
+
import shutil
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
import click
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Config directory and file
|
| 15 |
+
CONFIG_DIR = Path.home() / ".config" / "coderag"
|
| 16 |
+
CONFIG_FILE = CONFIG_DIR / "config.json"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_config() -> dict:
|
| 20 |
+
"""Load configuration from config file."""
|
| 21 |
+
if CONFIG_FILE.exists():
|
| 22 |
+
try:
|
| 23 |
+
return json.loads(CONFIG_FILE.read_text())
|
| 24 |
+
except Exception:
|
| 25 |
+
return {}
|
| 26 |
+
return {}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def save_config(config: dict) -> None:
|
| 30 |
+
"""Save configuration to config file."""
|
| 31 |
+
CONFIG_DIR.mkdir(parents=True, exist_ok=True)
|
| 32 |
+
CONFIG_FILE.write_text(json.dumps(config, indent=2))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_claude_config_path() -> Optional[Path]:
|
| 36 |
+
"""Get Claude Desktop config path based on OS."""
|
| 37 |
+
system = platform.system()
|
| 38 |
+
|
| 39 |
+
if system == "Darwin": # macOS
|
| 40 |
+
return Path.home() / "Library" / "Application Support" / "Claude" / "claude_desktop_config.json"
|
| 41 |
+
elif system == "Linux":
|
| 42 |
+
return Path.home() / ".config" / "Claude" / "claude_desktop_config.json"
|
| 43 |
+
elif system == "Windows":
|
| 44 |
+
appdata = os.environ.get("APPDATA", "")
|
| 45 |
+
if appdata:
|
| 46 |
+
return Path(appdata) / "Claude" / "claude_desktop_config.json"
|
| 47 |
+
return None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@click.group()
|
| 51 |
+
@click.version_option(package_name="coderag")
|
| 52 |
+
def cli():
|
| 53 |
+
"""CodeRAG - RAG-based Q&A system for code repositories.
|
| 54 |
+
|
| 55 |
+
Use 'coderag setup' to configure, then 'coderag serve' to start.
|
| 56 |
+
For Claude Desktop integration, run 'coderag mcp-install'.
|
| 57 |
+
"""
|
| 58 |
+
pass
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@cli.command()
|
| 62 |
+
@click.option("--provider", type=click.Choice(["groq", "openai", "anthropic", "openrouter", "together", "local"]),
|
| 63 |
+
default=None, help="LLM provider to use")
|
| 64 |
+
@click.option("--api-key", default=None, help="API key for the provider")
|
| 65 |
+
def setup(provider: Optional[str], api_key: Optional[str]):
|
| 66 |
+
"""Interactive setup wizard for CodeRAG.
|
| 67 |
+
|
| 68 |
+
Configures the LLM provider and API key. Configuration is saved to
|
| 69 |
+
~/.config/coderag/config.json and can be overridden by environment variables.
|
| 70 |
+
"""
|
| 71 |
+
config = get_config()
|
| 72 |
+
|
| 73 |
+
click.echo("\n🔧 CodeRAG Setup\n")
|
| 74 |
+
|
| 75 |
+
# Provider selection
|
| 76 |
+
if provider is None:
|
| 77 |
+
click.echo("Select your LLM provider:")
|
| 78 |
+
click.echo(" 1. groq (FREE, fast - recommended)")
|
| 79 |
+
click.echo(" 2. openai")
|
| 80 |
+
click.echo(" 3. anthropic")
|
| 81 |
+
click.echo(" 4. openrouter")
|
| 82 |
+
click.echo(" 5. together")
|
| 83 |
+
click.echo(" 6. local (requires GPU)")
|
| 84 |
+
|
| 85 |
+
choice = click.prompt("Enter choice", type=int, default=1)
|
| 86 |
+
providers = {1: "groq", 2: "openai", 3: "anthropic", 4: "openrouter", 5: "together", 6: "local"}
|
| 87 |
+
provider = providers.get(choice, "groq")
|
| 88 |
+
|
| 89 |
+
config["llm_provider"] = provider
|
| 90 |
+
|
| 91 |
+
# API key (not needed for local)
|
| 92 |
+
if provider != "local":
|
| 93 |
+
if api_key is None:
|
| 94 |
+
api_key_urls = {
|
| 95 |
+
"groq": "https://console.groq.com/keys",
|
| 96 |
+
"openai": "https://platform.openai.com/api-keys",
|
| 97 |
+
"anthropic": "https://console.anthropic.com/settings/keys",
|
| 98 |
+
"openrouter": "https://openrouter.ai/keys",
|
| 99 |
+
"together": "https://api.together.xyz/settings/api-keys",
|
| 100 |
+
}
|
| 101 |
+
url = api_key_urls.get(provider, "")
|
| 102 |
+
if url:
|
| 103 |
+
click.echo(f"\nGet your API key from: {url}")
|
| 104 |
+
|
| 105 |
+
api_key = click.prompt("Enter your API key", hide_input=True)
|
| 106 |
+
|
| 107 |
+
config["llm_api_key"] = api_key
|
| 108 |
+
|
| 109 |
+
# Validate API key
|
| 110 |
+
click.echo("\n⏳ Validating API key...")
|
| 111 |
+
if _validate_api_key(provider, api_key):
|
| 112 |
+
click.echo("✅ API key is valid!")
|
| 113 |
+
else:
|
| 114 |
+
click.echo("⚠️ Could not validate API key. It may still work.")
|
| 115 |
+
else:
|
| 116 |
+
click.echo("\n⚠️ Local mode requires a CUDA-capable GPU.")
|
| 117 |
+
|
| 118 |
+
# Save config
|
| 119 |
+
save_config(config)
|
| 120 |
+
click.echo(f"\n✅ Configuration saved to {CONFIG_FILE}")
|
| 121 |
+
|
| 122 |
+
# Next steps
|
| 123 |
+
click.echo("\n📋 Next steps:")
|
| 124 |
+
click.echo(" 1. Run 'coderag serve' to start the web interface")
|
| 125 |
+
click.echo(" 2. Run 'coderag mcp-install' to integrate with Claude Desktop")
|
| 126 |
+
click.echo(" 3. Run 'coderag index <url>' to index a repository")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _validate_api_key(provider: str, api_key: str) -> bool:
|
| 130 |
+
"""Validate API key by making a test request."""
|
| 131 |
+
try:
|
| 132 |
+
from openai import OpenAI
|
| 133 |
+
|
| 134 |
+
base_urls = {
|
| 135 |
+
"groq": "https://api.groq.com/openai/v1",
|
| 136 |
+
"openai": "https://api.openai.com/v1",
|
| 137 |
+
"openrouter": "https://openrouter.ai/api/v1",
|
| 138 |
+
"together": "https://api.together.xyz/v1",
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
if provider not in base_urls:
|
| 142 |
+
return True # Can't validate, assume OK
|
| 143 |
+
|
| 144 |
+
client = OpenAI(api_key=api_key, base_url=base_urls[provider])
|
| 145 |
+
client.models.list()
|
| 146 |
+
return True
|
| 147 |
+
except Exception:
|
| 148 |
+
return False
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@cli.command()
@click.option("--host", default="0.0.0.0", help="Host to bind to")
@click.option("--port", default=8000, type=int, help="Port to bind to")
@click.option("--reload", is_flag=True, help="Enable auto-reload for development")
def serve(host: str, port: int, reload: bool):
    """Start the CodeRAG web server.

    Starts the FastAPI server with Gradio UI, REST API, and MCP endpoint.
    """
    # Apply config from file to environment
    _apply_config_to_env()

    import uvicorn
    from coderag.config import get_settings

    settings = get_settings()

    click.echo(f"\n🚀 Starting CodeRAG server at http://{host}:{port}")
    click.echo(" Press Ctrl+C to stop\n")

    if reload:
        # uvicorn requires an *import string* (plus factory=True for an app
        # factory) to support --reload; passing an app object silently
        # disables reloading with only a warning.
        uvicorn.run(
            "coderag.main:create_app",
            factory=True,
            host=host,
            port=port,
            reload=True,
            log_level=settings.server.log_level,
        )
    else:
        from coderag.main import create_app

        uvicorn.run(
            create_app(),
            host=host,
            port=port,
            log_level=settings.server.log_level,
        )
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
@cli.command("mcp-run")
|
| 183 |
+
def mcp_run():
|
| 184 |
+
"""Run MCP server in stdio mode (for Claude Desktop).
|
| 185 |
+
|
| 186 |
+
This command is used by Claude Desktop to communicate with CodeRAG.
|
| 187 |
+
You typically don't need to run this manually.
|
| 188 |
+
"""
|
| 189 |
+
# Apply config from file to environment
|
| 190 |
+
_apply_config_to_env()
|
| 191 |
+
|
| 192 |
+
# Suppress all output except MCP protocol
|
| 193 |
+
import logging
|
| 194 |
+
logging.basicConfig(level=logging.WARNING, stream=sys.stderr)
|
| 195 |
+
|
| 196 |
+
import structlog
|
| 197 |
+
structlog.configure(
|
| 198 |
+
wrapper_class=structlog.make_filtering_bound_logger(logging.CRITICAL),
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
from coderag.mcp.server import create_mcp_server
|
| 202 |
+
mcp = create_mcp_server()
|
| 203 |
+
mcp.run(transport="stdio")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
@cli.command("mcp-install")
|
| 207 |
+
@click.option("--dry-run", is_flag=True, help="Preview changes without applying")
|
| 208 |
+
def mcp_install(dry_run: bool):
|
| 209 |
+
"""Configure Claude Desktop to use CodeRAG MCP.
|
| 210 |
+
|
| 211 |
+
Automatically detects your OS and updates the Claude Desktop configuration
|
| 212 |
+
to include the CodeRAG MCP server.
|
| 213 |
+
"""
|
| 214 |
+
config_path = get_claude_config_path()
|
| 215 |
+
|
| 216 |
+
if config_path is None:
|
| 217 |
+
click.echo("❌ Could not determine Claude Desktop config location.")
|
| 218 |
+
click.echo(" Please manually add the MCP configuration.")
|
| 219 |
+
sys.exit(1)
|
| 220 |
+
|
| 221 |
+
click.echo(f"\n🔍 Claude Desktop config: {config_path}")
|
| 222 |
+
|
| 223 |
+
# Check if Claude Desktop is installed
|
| 224 |
+
if not config_path.parent.exists():
|
| 225 |
+
click.echo("\n❌ Claude Desktop does not appear to be installed.")
|
| 226 |
+
click.echo(" Install it from: https://claude.ai/download")
|
| 227 |
+
sys.exit(1)
|
| 228 |
+
|
| 229 |
+
# Load existing config or create new
|
| 230 |
+
if config_path.exists():
|
| 231 |
+
try:
|
| 232 |
+
config = json.loads(config_path.read_text())
|
| 233 |
+
except json.JSONDecodeError:
|
| 234 |
+
click.echo("⚠️ Existing config is invalid JSON. Creating new config.")
|
| 235 |
+
config = {}
|
| 236 |
+
else:
|
| 237 |
+
config = {}
|
| 238 |
+
|
| 239 |
+
# Ensure mcpServers key exists
|
| 240 |
+
if "mcpServers" not in config:
|
| 241 |
+
config["mcpServers"] = {}
|
| 242 |
+
|
| 243 |
+
# Find the coderag-mcp command path
|
| 244 |
+
coderag_path = shutil.which("coderag")
|
| 245 |
+
if coderag_path is None:
|
| 246 |
+
# Fallback to python -m
|
| 247 |
+
python_path = sys.executable
|
| 248 |
+
mcp_command = [python_path, "-m", "coderag.mcp.cli"]
|
| 249 |
+
else:
|
| 250 |
+
mcp_command = [coderag_path, "mcp-run"]
|
| 251 |
+
|
| 252 |
+
# Prepare MCP server config
|
| 253 |
+
new_mcp_config = {
|
| 254 |
+
"command": mcp_command[0],
|
| 255 |
+
"args": mcp_command[1:] if len(mcp_command) > 1 else [],
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
# Check if already configured
|
| 259 |
+
existing = config["mcpServers"].get("coderag")
|
| 260 |
+
if existing == new_mcp_config:
|
| 261 |
+
click.echo("\n✅ CodeRAG MCP is already configured correctly!")
|
| 262 |
+
return
|
| 263 |
+
|
| 264 |
+
# Show diff
|
| 265 |
+
click.echo("\n📝 Changes to be made:")
|
| 266 |
+
if existing:
|
| 267 |
+
click.echo(f" Update: mcpServers.coderag")
|
| 268 |
+
click.echo(f" From: {json.dumps(existing)}")
|
| 269 |
+
click.echo(f" To: {json.dumps(new_mcp_config)}")
|
| 270 |
+
else:
|
| 271 |
+
click.echo(f" Add: mcpServers.coderag = {json.dumps(new_mcp_config)}")
|
| 272 |
+
|
| 273 |
+
if dry_run:
|
| 274 |
+
click.echo("\n🔍 Dry run - no changes made.")
|
| 275 |
+
return
|
| 276 |
+
|
| 277 |
+
# Backup existing config
|
| 278 |
+
if config_path.exists():
|
| 279 |
+
backup_path = config_path.with_suffix(".json.backup")
|
| 280 |
+
shutil.copy(config_path, backup_path)
|
| 281 |
+
click.echo(f"\n📦 Backup saved to: {backup_path}")
|
| 282 |
+
|
| 283 |
+
# Apply changes
|
| 284 |
+
config["mcpServers"]["coderag"] = new_mcp_config
|
| 285 |
+
config_path.parent.mkdir(parents=True, exist_ok=True)
|
| 286 |
+
config_path.write_text(json.dumps(config, indent=2))
|
| 287 |
+
|
| 288 |
+
click.echo("\n✅ Claude Desktop configuration updated!")
|
| 289 |
+
click.echo("\n⚠️ Please restart Claude Desktop to apply changes.")
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
@cli.command("index")
|
| 293 |
+
@click.argument("url")
|
| 294 |
+
@click.option("--branch", default="", help="Branch to index (default: main/master)")
|
| 295 |
+
def index(url: str, branch: str):
|
| 296 |
+
"""Index a GitHub repository.
|
| 297 |
+
|
| 298 |
+
URL: The GitHub repository URL to index.
|
| 299 |
+
|
| 300 |
+
Example: coderag index https://github.com/owner/repo
|
| 301 |
+
"""
|
| 302 |
+
# Apply config from file to environment
|
| 303 |
+
_apply_config_to_env()
|
| 304 |
+
|
| 305 |
+
import asyncio
|
| 306 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 307 |
+
|
| 308 |
+
click.echo(f"\n📦 Indexing repository: {url}")
|
| 309 |
+
if branch:
|
| 310 |
+
click.echo(f" Branch: {branch}")
|
| 311 |
+
|
| 312 |
+
handlers = get_mcp_handlers()
|
| 313 |
+
|
| 314 |
+
async def run_index():
|
| 315 |
+
result = await handlers.index_repository(url=url, branch=branch)
|
| 316 |
+
return result
|
| 317 |
+
|
| 318 |
+
result = asyncio.run(run_index())
|
| 319 |
+
|
| 320 |
+
if result.get("success"):
|
| 321 |
+
click.echo(f"\n✅ Repository indexed successfully!")
|
| 322 |
+
click.echo(f" Repo ID: {result['repo_id']}")
|
| 323 |
+
click.echo(f" Name: {result['name']}")
|
| 324 |
+
click.echo(f" Files processed: {result['files_processed']}")
|
| 325 |
+
click.echo(f" Chunks indexed: {result['chunks_indexed']}")
|
| 326 |
+
click.echo(f"\n Use 'coderag query {result['repo_id'][:8]} \"your question\"' to query")
|
| 327 |
+
else:
|
| 328 |
+
click.echo(f"\n❌ Indexing failed: {result.get('error', 'Unknown error')}")
|
| 329 |
+
sys.exit(1)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
@cli.command("query")
|
| 333 |
+
@click.argument("repo_id")
|
| 334 |
+
@click.argument("question")
|
| 335 |
+
@click.option("--top-k", default=5, type=int, help="Number of chunks to retrieve")
|
| 336 |
+
@click.option("--format", "output_format", type=click.Choice(["text", "json"]), default="text", help="Output format")
|
| 337 |
+
def query(repo_id: str, question: str, top_k: int, output_format: str):
|
| 338 |
+
"""Ask a question about an indexed repository.
|
| 339 |
+
|
| 340 |
+
REPO_ID: Repository ID (full or first 8 characters)
|
| 341 |
+
QUESTION: Your question about the code
|
| 342 |
+
|
| 343 |
+
Example: coderag query abc12345 "How does authentication work?"
|
| 344 |
+
"""
|
| 345 |
+
# Apply config from file to environment
|
| 346 |
+
_apply_config_to_env()
|
| 347 |
+
|
| 348 |
+
import asyncio
|
| 349 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 350 |
+
|
| 351 |
+
handlers = get_mcp_handlers()
|
| 352 |
+
|
| 353 |
+
async def run_query():
|
| 354 |
+
result = await handlers.query_code(repo_id=repo_id, question=question, top_k=top_k)
|
| 355 |
+
return result
|
| 356 |
+
|
| 357 |
+
click.echo(f"\n🔍 Querying: {question}\n")
|
| 358 |
+
result = asyncio.run(run_query())
|
| 359 |
+
|
| 360 |
+
if result.get("error"):
|
| 361 |
+
click.echo(f"❌ Error: {result['error']}")
|
| 362 |
+
sys.exit(1)
|
| 363 |
+
|
| 364 |
+
if output_format == "json":
|
| 365 |
+
click.echo(json.dumps(result, indent=2))
|
| 366 |
+
else:
|
| 367 |
+
click.echo("📝 Answer:\n")
|
| 368 |
+
click.echo(result.get("answer", "No answer generated."))
|
| 369 |
+
|
| 370 |
+
if result.get("citations"):
|
| 371 |
+
click.echo("\n📍 Citations:")
|
| 372 |
+
for citation in result["citations"]:
|
| 373 |
+
click.echo(f" {citation}")
|
| 374 |
+
|
| 375 |
+
if result.get("evidence"):
|
| 376 |
+
click.echo("\n📂 Evidence:")
|
| 377 |
+
for chunk in result["evidence"][:3]: # Show top 3
|
| 378 |
+
click.echo(f" - {chunk['file']}:{chunk['start_line']}-{chunk['end_line']} (relevance: {chunk['relevance']})")
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
@cli.command("repos")
|
| 382 |
+
@click.option("--format", "output_format", type=click.Choice(["text", "json"]), default="text", help="Output format")
|
| 383 |
+
def repos(output_format: str):
|
| 384 |
+
"""List all indexed repositories."""
|
| 385 |
+
# Apply config from file to environment
|
| 386 |
+
_apply_config_to_env()
|
| 387 |
+
|
| 388 |
+
import asyncio
|
| 389 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 390 |
+
|
| 391 |
+
handlers = get_mcp_handlers()
|
| 392 |
+
|
| 393 |
+
async def run_list():
|
| 394 |
+
result = await handlers.list_repositories()
|
| 395 |
+
return result
|
| 396 |
+
|
| 397 |
+
result = asyncio.run(run_list())
|
| 398 |
+
|
| 399 |
+
if output_format == "json":
|
| 400 |
+
click.echo(json.dumps(result, indent=2))
|
| 401 |
+
else:
|
| 402 |
+
repos_list = result.get("repositories", [])
|
| 403 |
+
if not repos_list:
|
| 404 |
+
click.echo("\n📭 No repositories indexed yet.")
|
| 405 |
+
click.echo(" Run 'coderag index <url>' to index a repository.")
|
| 406 |
+
return
|
| 407 |
+
|
| 408 |
+
click.echo(f"\n📚 Indexed Repositories ({len(repos_list)}):\n")
|
| 409 |
+
for repo in repos_list:
|
| 410 |
+
status_icon = "✅" if repo["status"] == "ready" else "⏳" if repo["status"] == "indexing" else "❌"
|
| 411 |
+
click.echo(f" {status_icon} {repo['id'][:8]} {repo['name']} ({repo['branch']})")
|
| 412 |
+
click.echo(f" Chunks: {repo['chunk_count']} | Indexed: {repo.get('indexed_at', 'N/A')}")
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
@cli.command("update")
|
| 416 |
+
@click.argument("repo_id")
|
| 417 |
+
def update(repo_id: str):
|
| 418 |
+
"""Update an indexed repository with latest changes.
|
| 419 |
+
|
| 420 |
+
REPO_ID: Repository ID (full or first 8 characters)
|
| 421 |
+
|
| 422 |
+
Fetches the latest changes from GitHub and re-indexes only the modified files.
|
| 423 |
+
This is faster than a full re-index for repositories with frequent updates.
|
| 424 |
+
|
| 425 |
+
Example: coderag update abc12345
|
| 426 |
+
"""
|
| 427 |
+
# Apply config from file to environment
|
| 428 |
+
_apply_config_to_env()
|
| 429 |
+
|
| 430 |
+
import asyncio
|
| 431 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 432 |
+
|
| 433 |
+
click.echo(f"\n🔄 Updating repository: {repo_id}\n")
|
| 434 |
+
|
| 435 |
+
handlers = get_mcp_handlers()
|
| 436 |
+
|
| 437 |
+
async def run_update():
|
| 438 |
+
result = await handlers.update_repository(repo_id=repo_id)
|
| 439 |
+
return result
|
| 440 |
+
|
| 441 |
+
result = asyncio.run(run_update())
|
| 442 |
+
|
| 443 |
+
if result.get("error"):
|
| 444 |
+
click.echo(f"❌ Error: {result['error']}")
|
| 445 |
+
sys.exit(1)
|
| 446 |
+
|
| 447 |
+
if result.get("message") == "Repository is already up to date":
|
| 448 |
+
click.echo("✅ Repository is already up to date!")
|
| 449 |
+
else:
|
| 450 |
+
click.echo("✅ Repository updated successfully!")
|
| 451 |
+
click.echo(f" Files changed: {result.get('files_changed', 0)}")
|
| 452 |
+
click.echo(f" - Added: {result.get('files_added', 0)}")
|
| 453 |
+
click.echo(f" - Modified: {result.get('files_modified', 0)}")
|
| 454 |
+
click.echo(f" - Deleted: {result.get('files_deleted', 0)}")
|
| 455 |
+
click.echo(f" Chunks added: {result.get('chunks_added', 0)}")
|
| 456 |
+
click.echo(f" Chunks deleted: {result.get('chunks_deleted', 0)}")
|
| 457 |
+
click.echo(f" Total chunks: {result.get('total_chunks', 0)}")
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
@cli.command("delete")
|
| 461 |
+
@click.argument("repo_id")
|
| 462 |
+
@click.option("--force", "-f", is_flag=True, help="Skip confirmation prompt")
|
| 463 |
+
def delete(repo_id: str, force: bool):
|
| 464 |
+
"""Delete an indexed repository.
|
| 465 |
+
|
| 466 |
+
REPO_ID: Repository ID (full or first 8 characters)
|
| 467 |
+
|
| 468 |
+
Removes the repository from the index and deletes all associated chunks
|
| 469 |
+
from the vector store.
|
| 470 |
+
|
| 471 |
+
Example: coderag delete abc12345
|
| 472 |
+
"""
|
| 473 |
+
# Apply config from file to environment
|
| 474 |
+
_apply_config_to_env()
|
| 475 |
+
|
| 476 |
+
import asyncio
|
| 477 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 478 |
+
|
| 479 |
+
handlers = get_mcp_handlers()
|
| 480 |
+
|
| 481 |
+
# First get repo info for confirmation
|
| 482 |
+
async def get_repo_info():
|
| 483 |
+
result = await handlers.get_repository_info(repo_id=repo_id)
|
| 484 |
+
return result
|
| 485 |
+
|
| 486 |
+
info = asyncio.run(get_repo_info())
|
| 487 |
+
|
| 488 |
+
if info.get("error"):
|
| 489 |
+
click.echo(f"❌ Error: {info['error']}")
|
| 490 |
+
sys.exit(1)
|
| 491 |
+
|
| 492 |
+
repo_name = info.get("name", repo_id)
|
| 493 |
+
chunk_count = info.get("chunk_count", 0)
|
| 494 |
+
|
| 495 |
+
if not force:
|
| 496 |
+
click.echo(f"\n⚠️ About to delete: {repo_name}")
|
| 497 |
+
click.echo(f" Chunks to delete: {chunk_count}")
|
| 498 |
+
if not click.confirm("\nAre you sure?"):
|
| 499 |
+
click.echo("Cancelled.")
|
| 500 |
+
return
|
| 501 |
+
|
| 502 |
+
async def run_delete():
|
| 503 |
+
result = await handlers.delete_repository(repo_id=repo_id)
|
| 504 |
+
return result
|
| 505 |
+
|
| 506 |
+
result = asyncio.run(run_delete())
|
| 507 |
+
|
| 508 |
+
if result.get("error"):
|
| 509 |
+
click.echo(f"❌ Error: {result['error']}")
|
| 510 |
+
sys.exit(1)
|
| 511 |
+
|
| 512 |
+
click.echo(f"\n✅ Repository deleted: {result.get('name', repo_id)}")
|
| 513 |
+
click.echo(f" Chunks removed: {result.get('chunks_deleted', 0)}")
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
@cli.command("clean")
|
| 517 |
+
@click.option("--force", "-f", is_flag=True, help="Skip confirmation prompt")
|
| 518 |
+
def clean(force: bool):
|
| 519 |
+
"""Clean up repositories with errors or stuck in indexing.
|
| 520 |
+
|
| 521 |
+
Removes all repositories that have status 'error' or have been stuck
|
| 522 |
+
in 'indexing' or 'pending' status for too long.
|
| 523 |
+
|
| 524 |
+
Example: coderag clean
|
| 525 |
+
"""
|
| 526 |
+
# Apply config from file to environment
|
| 527 |
+
_apply_config_to_env()
|
| 528 |
+
|
| 529 |
+
import asyncio
|
| 530 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 531 |
+
|
| 532 |
+
handlers = get_mcp_handlers()
|
| 533 |
+
|
| 534 |
+
async def get_repos():
|
| 535 |
+
result = await handlers.list_repositories()
|
| 536 |
+
return result
|
| 537 |
+
|
| 538 |
+
result = asyncio.run(get_repos())
|
| 539 |
+
repos = result.get("repositories", [])
|
| 540 |
+
|
| 541 |
+
# Find repos to clean
|
| 542 |
+
to_clean = [r for r in repos if r["status"] in ("error", "indexing", "pending")]
|
| 543 |
+
|
| 544 |
+
if not to_clean:
|
| 545 |
+
click.echo("\n✅ No repositories need cleaning.")
|
| 546 |
+
return
|
| 547 |
+
|
| 548 |
+
click.echo(f"\n🧹 Found {len(to_clean)} repository(ies) to clean:\n")
|
| 549 |
+
for repo in to_clean:
|
| 550 |
+
status_icon = "❌" if repo["status"] == "error" else "⏳"
|
| 551 |
+
click.echo(f" {status_icon} {repo['id'][:8]} {repo['name']} ({repo['status']})")
|
| 552 |
+
|
| 553 |
+
if not force:
|
| 554 |
+
if not click.confirm(f"\nDelete these {len(to_clean)} repositories?"):
|
| 555 |
+
click.echo("Cancelled.")
|
| 556 |
+
return
|
| 557 |
+
|
| 558 |
+
# Delete each repo
|
| 559 |
+
deleted = 0
|
| 560 |
+
for repo in to_clean:
|
| 561 |
+
async def run_delete():
|
| 562 |
+
return await handlers.delete_repository(repo_id=repo["id"])
|
| 563 |
+
|
| 564 |
+
try:
|
| 565 |
+
result = asyncio.run(run_delete())
|
| 566 |
+
if result.get("success"):
|
| 567 |
+
deleted += 1
|
| 568 |
+
click.echo(f" ✅ Deleted: {repo['name']}")
|
| 569 |
+
else:
|
| 570 |
+
click.echo(f" ❌ Failed: {repo['name']} - {result.get('error', 'Unknown')}")
|
| 571 |
+
except Exception as e:
|
| 572 |
+
click.echo(f" ❌ Failed: {repo['name']} - {str(e)}")
|
| 573 |
+
|
| 574 |
+
click.echo(f"\n✅ Cleaned {deleted}/{len(to_clean)} repositories.")
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
@cli.command("doctor")
|
| 578 |
+
def doctor():
|
| 579 |
+
"""Diagnose common issues with CodeRAG setup.
|
| 580 |
+
|
| 581 |
+
Checks Python version, configuration, API key validity, and system components.
|
| 582 |
+
"""
|
| 583 |
+
click.echo("\n🏥 CodeRAG Doctor\n")
|
| 584 |
+
all_ok = True
|
| 585 |
+
|
| 586 |
+
# Check Python version
|
| 587 |
+
py_version = sys.version_info
|
| 588 |
+
if py_version >= (3, 11):
|
| 589 |
+
click.echo(f"✅ Python version: {py_version.major}.{py_version.minor}.{py_version.micro}")
|
| 590 |
+
else:
|
| 591 |
+
click.echo(f"❌ Python version: {py_version.major}.{py_version.minor}.{py_version.micro} (need 3.11+)")
|
| 592 |
+
all_ok = False
|
| 593 |
+
|
| 594 |
+
# Check config file
|
| 595 |
+
config = get_config()
|
| 596 |
+
if config:
|
| 597 |
+
click.echo(f"✅ Config file exists: {CONFIG_FILE}")
|
| 598 |
+
if config.get("llm_provider"):
|
| 599 |
+
click.echo(f" Provider: {config['llm_provider']}")
|
| 600 |
+
else:
|
| 601 |
+
click.echo(f"⚠️ No config file. Run 'coderag setup' to configure.")
|
| 602 |
+
|
| 603 |
+
# Check API key
|
| 604 |
+
api_key = config.get("llm_api_key") or os.environ.get("MODEL_LLM_API_KEY")
|
| 605 |
+
provider = config.get("llm_provider") or os.environ.get("MODEL_LLM_PROVIDER", "groq")
|
| 606 |
+
|
| 607 |
+
if provider != "local":
|
| 608 |
+
if api_key:
|
| 609 |
+
click.echo(f"✅ API key configured (provider: {provider})")
|
| 610 |
+
else:
|
| 611 |
+
click.echo(f"❌ No API key configured for {provider}")
|
| 612 |
+
all_ok = False
|
| 613 |
+
|
| 614 |
+
# Check CUDA
|
| 615 |
+
try:
|
| 616 |
+
import torch
|
| 617 |
+
if torch.cuda.is_available():
|
| 618 |
+
click.echo(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
|
| 619 |
+
else:
|
| 620 |
+
click.echo("ℹ️ CUDA not available (CPU mode for embeddings)")
|
| 621 |
+
except ImportError:
|
| 622 |
+
click.echo("⚠️ PyTorch not installed")
|
| 623 |
+
all_ok = False
|
| 624 |
+
|
| 625 |
+
# Check ChromaDB data directory
|
| 626 |
+
from coderag.config import get_settings
|
| 627 |
+
settings = get_settings()
|
| 628 |
+
chroma_path = settings.vectorstore.persist_directory
|
| 629 |
+
if chroma_path.exists():
|
| 630 |
+
click.echo(f"✅ ChromaDB directory: {chroma_path}")
|
| 631 |
+
else:
|
| 632 |
+
click.echo(f"ℹ️ ChromaDB directory will be created: {chroma_path}")
|
| 633 |
+
|
| 634 |
+
# Check Claude Desktop
|
| 635 |
+
claude_config = get_claude_config_path()
|
| 636 |
+
if claude_config and claude_config.exists():
|
| 637 |
+
try:
|
| 638 |
+
config_data = json.loads(claude_config.read_text())
|
| 639 |
+
if "coderag" in config_data.get("mcpServers", {}):
|
| 640 |
+
click.echo("✅ Claude Desktop MCP configured")
|
| 641 |
+
else:
|
| 642 |
+
click.echo("ℹ️ Claude Desktop installed but MCP not configured. Run 'coderag mcp-install'")
|
| 643 |
+
except Exception:
|
| 644 |
+
click.echo("⚠️ Claude Desktop config exists but could not be read")
|
| 645 |
+
else:
|
| 646 |
+
click.echo("ℹ️ Claude Desktop not detected")
|
| 647 |
+
|
| 648 |
+
# Summary
|
| 649 |
+
if all_ok:
|
| 650 |
+
click.echo("\n✅ All checks passed!")
|
| 651 |
+
else:
|
| 652 |
+
click.echo("\n⚠️ Some issues detected. See above for details.")
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
def _apply_config_to_env():
    """Apply configuration from config file to environment variables."""
    config = get_config()

    # Config-file values seed the MODEL_* env vars, but an env var that is
    # already set (and non-empty) always wins.
    mappings = (
        ("llm_provider", "MODEL_LLM_PROVIDER"),
        ("llm_api_key", "MODEL_LLM_API_KEY"),
        ("embedding_device", "MODEL_EMBEDDING_DEVICE"),
    )
    for config_key, env_var in mappings:
        value = config.get(config_key)
        if value and not os.environ.get(env_var):
            os.environ[env_var] = value
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
def main():
    """Console-script entry point: dispatch to the click command group."""
    cli()
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
if __name__ == "__main__":
|
| 675 |
+
main()
|
coderag/config.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application configuration using pydantic-settings."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from pydantic import Field
|
| 7 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ModelSettings(BaseSettings):
    """LLM and embedding model configuration.

    Every field can be overridden via a ``MODEL_``-prefixed environment
    variable (e.g. ``MODEL_LLM_PROVIDER``, ``MODEL_EMBEDDING_DEVICE``).
    """

    model_config = SettingsConfigDict(env_prefix="MODEL_")

    # LLM Provider: "local", "openai", "groq", "anthropic", "openrouter"
    # Default to "groq" (free tier available, no GPU required)
    llm_provider: str = "groq"

    # API settings (for remote providers)
    llm_api_key: Optional[str] = None
    llm_api_base: Optional[str] = None  # Custom API base URL

    # Model name (local or remote)
    llm_name: str = "Qwen/Qwen2.5-Coder-3B-Instruct"
    # Generation/sampling parameters passed through to the LLM
    llm_max_new_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95

    # Local model settings (only used when llm_provider == "local")
    llm_use_4bit: bool = True
    llm_device_map: str = "auto"

    # Embedding model settings
    embedding_name: str = "nomic-ai/nomic-embed-text-v1.5"
    embedding_dimension: int = 768
    embedding_batch_size: int = 8  # Reduced for 8GB VRAM GPUs
    embedding_device: str = "auto"  # "auto" detects CUDA, falls back to CPU
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class VectorStoreSettings(BaseSettings):
    """ChromaDB vector store configuration.

    Fields can be overridden via ``VECTORSTORE_``-prefixed environment
    variables.
    """

    model_config = SettingsConfigDict(env_prefix="VECTORSTORE_")

    # Where ChromaDB persists its data on disk
    persist_directory: Path = Path("./data/chroma_db")
    collection_name: str = "coderag_chunks"
    distance_metric: str = "cosine"
    # Opt out of ChromaDB's anonymized usage telemetry
    anonymized_telemetry: bool = False
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class IngestionSettings(BaseSettings):
    """Repository ingestion configuration.

    Fields can be overridden via ``INGESTION_``-prefixed environment
    variables.
    """

    model_config = SettingsConfigDict(env_prefix="INGESTION_")

    # Local clone cache for indexed repositories
    repos_cache_dir: Path = Path("./data/repos")
    # Files larger than this are skipped during ingestion
    max_file_size_kb: int = 500
    default_branch: str = "main"
    # Text-splitting parameters (characters per chunk / overlap between chunks)
    chunk_size: int = 1500
    chunk_overlap: int = 200

    # Large repository handling: hard caps to bound index size and memory
    max_files_per_repo: int = 5000
    max_total_chunks: int = 50000
    batch_size: int = 100
    stream_processing: bool = True

    # Warning thresholds (informational, below the hard caps above)
    warn_files_threshold: int = 1000
    warn_chunks_threshold: int = 10000

    # Only source files matching these globs are indexed
    include_patterns: list[str] = Field(
        default_factory=lambda: ["*.py", "*.js", "*.ts", "*.java", "*.go", "*.rs", "*.c", "*.cpp", "*.h"]
    )
    # Excludes dependency dirs, build artifacts, lockfiles, and
    # secret-looking files (.env, credentials, passwords) to keep them
    # out of the index.
    exclude_patterns: list[str] = Field(
        default_factory=lambda: [
            "**/node_modules/**",
            "**/.git/**",
            "**/venv/**",
            "**/__pycache__/**",
            "**/dist/**",
            "**/build/**",
            "**/*.min.js",
            "**/*.min.css",
            "**/package-lock.json",
            "**/yarn.lock",
            "**/poetry.lock",
            "**/.env",
            "**/.env.*",
            "**/credentials*",
            "**/*secret*",
            "**/*password*",
        ]
    )
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class RetrievalSettings(BaseSettings):
    """Retrieval configuration.

    Fields can be overridden via ``RETRIEVAL_``-prefixed environment
    variables.
    """

    model_config = SettingsConfigDict(env_prefix="RETRIEVAL_")

    # Number of chunks returned per query by default, and the upper bound
    default_top_k: int = 5
    max_top_k: int = 20
    # Minimum similarity score for a chunk to be considered relevant
    similarity_threshold: float = 0.3
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ServerSettings(BaseSettings):
    """Server configuration.

    Fields can be overridden via ``SERVER_``-prefixed environment variables.
    """

    model_config = SettingsConfigDict(env_prefix="SERVER_")

    # Uvicorn bind address and worker options
    host: str = "0.0.0.0"
    port: int = 8000
    reload: bool = False
    workers: int = 1
    log_level: str = "info"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class Settings(BaseSettings):
    """Main application settings.

    Aggregates all sub-settings; values are read from the process
    environment and an optional ``.env`` file, with unknown keys ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_name: str = "CodeRAG"
    app_version: str = "0.1.0"
    debug: bool = False
    # Root directory for all persisted application data
    data_dir: Path = Path("./data")

    # Sub-settings; each reads its own env-prefixed variables
    models: ModelSettings = Field(default_factory=ModelSettings)
    vectorstore: VectorStoreSettings = Field(default_factory=VectorStoreSettings)
    ingestion: IngestionSettings = Field(default_factory=IngestionSettings)
    retrieval: RetrievalSettings = Field(default_factory=RetrievalSettings)
    server: ServerSettings = Field(default_factory=ServerSettings)

    def ensure_directories(self) -> None:
        """Create required directories if they don't exist."""
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.vectorstore.persist_directory.mkdir(parents=True, exist_ok=True)
        self.ingestion.repos_cache_dir.mkdir(parents=True, exist_ok=True)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
_settings: Optional[Settings] = None
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def get_settings() -> Settings:
    """Return the process-wide Settings singleton, creating it on first use.

    The first call instantiates Settings (loading env/.env values) and
    ensures the data directories exist; later calls return the cached object.
    """
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings()
    _settings.ensure_directories()
    return _settings
|
coderag/generation/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generation module: LLM inference and response generation with citations."""
|
| 2 |
+
|
| 3 |
+
from coderag.generation.generator import ResponseGenerator
|
| 4 |
+
from coderag.generation.prompts import SYSTEM_PROMPT, build_prompt
|
| 5 |
+
from coderag.generation.citations import CitationParser
|
| 6 |
+
|
| 7 |
+
__all__ = ["ResponseGenerator", "SYSTEM_PROMPT", "build_prompt", "CitationParser"]
|
coderag/generation/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (531 Bytes). View file
|
|
|
coderag/generation/__pycache__/citations.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
coderag/generation/__pycache__/generator.cpython-313.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
coderag/generation/__pycache__/prompts.cpython-313.pyc
ADDED
|
Binary file (3.15 kB). View file
|
|
|
coderag/generation/citations.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Citation parsing and formatting."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from coderag.models.response import Citation
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CitationParser:
    """Parses and validates citations from LLM responses."""

    # Matches citations of the form [file.py:10-20] or [path/to/file.py:10-20]
    CITATION_PATTERN = re.compile(r"\[([^\]]+):(\d+)-(\d+)\]")

    def parse_citations(self, text: str) -> list[Citation]:
        """Extract all citations from text.

        Args:
            text: Text containing citations

        Returns:
            List of parsed Citation objects
        """
        return [
            Citation(
                file_path=match.group(1),
                start_line=int(match.group(2)),
                end_line=int(match.group(3)),
            )
            for match in self.CITATION_PATTERN.finditer(text)
        ]

    def validate_citation(self, citation: Citation, available_files: set[str]) -> bool:
        """Check if a citation references an existing file."""
        return citation.file_path in available_files

    def validate_citations(
        self,
        citations: list[Citation],
        available_files: set[str],
    ) -> tuple[list[Citation], list[Citation]]:
        """Validate multiple citations.

        Returns:
            Tuple of (valid_citations, invalid_citations)
        """
        valid: list[Citation] = []
        invalid: list[Citation] = []
        for citation in citations:
            bucket = valid if self.validate_citation(citation, available_files) else invalid
            bucket.append(citation)
        return valid, invalid

    def format_citation(self, file_path: str, start_line: int, end_line: int) -> str:
        """Format a citation string."""
        return f"[{file_path}:{start_line}-{end_line}]"

    def has_citations(self, text: str) -> bool:
        """Check if text contains any citations."""
        return self.CITATION_PATTERN.search(text) is not None

    def count_citations(self, text: str) -> int:
        """Count citations in text."""
        return sum(1 for _ in self.CITATION_PATTERN.finditer(text))

    def extract_unique_files(self, citations: list[Citation]) -> set[str]:
        """Get unique file paths from citations."""
        return {citation.file_path for citation in citations}
|
coderag/generation/generator.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Response generation using local or remote LLMs."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from coderag.config import get_settings
|
| 6 |
+
from coderag.generation.citations import CitationParser
|
| 7 |
+
from coderag.generation.prompts import SYSTEM_PROMPT, build_prompt, build_no_context_response
|
| 8 |
+
from coderag.logging import get_logger
|
| 9 |
+
from coderag.models.response import Response
|
| 10 |
+
from coderag.models.query import Query
|
| 11 |
+
from coderag.retrieval.retriever import Retriever
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ResponseGenerator:
|
| 17 |
+
"""Generates grounded responses using local or remote LLMs."""
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
retriever: Optional[Retriever] = None,
|
| 22 |
+
) -> None:
|
| 23 |
+
self.settings = get_settings()
|
| 24 |
+
self.retriever = retriever or Retriever()
|
| 25 |
+
self.citation_parser = CitationParser()
|
| 26 |
+
|
| 27 |
+
self.provider = self.settings.models.llm_provider.lower()
|
| 28 |
+
self._client = None
|
| 29 |
+
self._local_model = None
|
| 30 |
+
self._local_tokenizer = None
|
| 31 |
+
|
| 32 |
+
logger.info("ResponseGenerator initialized", provider=self.provider)
|
| 33 |
+
|
| 34 |
+
def _get_api_client(self):
|
| 35 |
+
"""Get or create API client for remote providers."""
|
| 36 |
+
if self._client is not None:
|
| 37 |
+
return self._client
|
| 38 |
+
|
| 39 |
+
import httpx
|
| 40 |
+
from openai import OpenAI
|
| 41 |
+
|
| 42 |
+
api_key = self.settings.models.llm_api_key
|
| 43 |
+
if not api_key:
|
| 44 |
+
raise ValueError(f"API key required for provider: {self.provider}")
|
| 45 |
+
|
| 46 |
+
# Provider-specific configurations
|
| 47 |
+
provider_configs = {
|
| 48 |
+
"openai": {
|
| 49 |
+
"base_url": "https://api.openai.com/v1",
|
| 50 |
+
"default_model": "gpt-4o-mini",
|
| 51 |
+
},
|
| 52 |
+
"groq": {
|
| 53 |
+
"base_url": "https://api.groq.com/openai/v1",
|
| 54 |
+
"default_model": "llama-3.3-70b-versatile",
|
| 55 |
+
},
|
| 56 |
+
"anthropic": {
|
| 57 |
+
"base_url": "https://api.anthropic.com/v1",
|
| 58 |
+
"default_model": "claude-3-5-sonnet-20241022",
|
| 59 |
+
},
|
| 60 |
+
"openrouter": {
|
| 61 |
+
"base_url": "https://openrouter.ai/api/v1",
|
| 62 |
+
"default_model": "anthropic/claude-3.5-sonnet",
|
| 63 |
+
},
|
| 64 |
+
"together": {
|
| 65 |
+
"base_url": "https://api.together.xyz/v1",
|
| 66 |
+
"default_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
| 67 |
+
},
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
config = provider_configs.get(self.provider, {})
|
| 71 |
+
base_url = self.settings.models.llm_api_base or config.get("base_url")
|
| 72 |
+
|
| 73 |
+
if not base_url:
|
| 74 |
+
raise ValueError(f"Unknown provider: {self.provider}")
|
| 75 |
+
|
| 76 |
+
# Set default model if not specified and it's a known provider
|
| 77 |
+
if self.settings.models.llm_name.startswith("Qwen/"):
|
| 78 |
+
self.model_name = config.get("default_model", self.settings.models.llm_name)
|
| 79 |
+
else:
|
| 80 |
+
self.model_name = self.settings.models.llm_name
|
| 81 |
+
|
| 82 |
+
self._client = OpenAI(
|
| 83 |
+
api_key=api_key,
|
| 84 |
+
base_url=base_url,
|
| 85 |
+
http_client=httpx.Client(timeout=120.0),
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
logger.info("API client created", provider=self.provider, model=self.model_name)
|
| 89 |
+
return self._client
|
| 90 |
+
|
| 91 |
+
def _load_local_model(self):
|
| 92 |
+
"""Load local model with transformers."""
|
| 93 |
+
if self._local_model is not None:
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
import torch
|
| 97 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 98 |
+
|
| 99 |
+
if not torch.cuda.is_available():
|
| 100 |
+
raise RuntimeError(
|
| 101 |
+
"Local LLM requires a CUDA-capable GPU. Options:\n"
|
| 102 |
+
" 1. Use a cloud provider (free): MODEL_LLM_PROVIDER=groq\n"
|
| 103 |
+
" Get API key at: https://console.groq.com/keys\n"
|
| 104 |
+
" 2. Install CUDA and a compatible GPU"
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
logger.info("Loading local LLM", model=self.settings.models.llm_name)
|
| 108 |
+
|
| 109 |
+
if self.settings.models.llm_use_4bit:
|
| 110 |
+
bnb_config = BitsAndBytesConfig(
|
| 111 |
+
load_in_4bit=True,
|
| 112 |
+
bnb_4bit_quant_type="nf4",
|
| 113 |
+
bnb_4bit_compute_dtype=torch.float16,
|
| 114 |
+
bnb_4bit_use_double_quant=True,
|
| 115 |
+
)
|
| 116 |
+
else:
|
| 117 |
+
bnb_config = None
|
| 118 |
+
|
| 119 |
+
self._local_tokenizer = AutoTokenizer.from_pretrained(
|
| 120 |
+
self.settings.models.llm_name,
|
| 121 |
+
trust_remote_code=True,
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
self._local_model = AutoModelForCausalLM.from_pretrained(
|
| 125 |
+
self.settings.models.llm_name,
|
| 126 |
+
quantization_config=bnb_config,
|
| 127 |
+
device_map=self.settings.models.llm_device_map,
|
| 128 |
+
trust_remote_code=True,
|
| 129 |
+
torch_dtype=torch.float16,
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
logger.info("Local LLM loaded successfully")
|
| 133 |
+
|
| 134 |
+
def generate(self, query: Query) -> Response:
|
| 135 |
+
"""Generate a response for a query."""
|
| 136 |
+
# Retrieve relevant chunks
|
| 137 |
+
chunks, context = self.retriever.retrieve_with_context(
|
| 138 |
+
query.question,
|
| 139 |
+
query.repo_id,
|
| 140 |
+
query.top_k,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# Handle no results
|
| 144 |
+
if not chunks:
|
| 145 |
+
return Response(
|
| 146 |
+
answer=build_no_context_response(),
|
| 147 |
+
citations=[],
|
| 148 |
+
retrieved_chunks=[],
|
| 149 |
+
grounded=False,
|
| 150 |
+
query_id=query.id,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# Build prompt and generate
|
| 154 |
+
prompt = build_prompt(query.question, context)
|
| 155 |
+
|
| 156 |
+
if self.provider == "local":
|
| 157 |
+
answer = self._generate_local(prompt)
|
| 158 |
+
else:
|
| 159 |
+
answer = self._generate_api(prompt)
|
| 160 |
+
|
| 161 |
+
# Parse citations from answer
|
| 162 |
+
citations = self.citation_parser.parse_citations(answer)
|
| 163 |
+
|
| 164 |
+
# Determine if response is grounded
|
| 165 |
+
grounded = len(citations) > 0 and len(chunks) > 0
|
| 166 |
+
|
| 167 |
+
return Response(
|
| 168 |
+
answer=answer,
|
| 169 |
+
citations=citations,
|
| 170 |
+
retrieved_chunks=chunks,
|
| 171 |
+
grounded=grounded,
|
| 172 |
+
query_id=query.id,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
def _generate_api(self, prompt: str) -> str:
|
| 176 |
+
"""Generate using remote API."""
|
| 177 |
+
client = self._get_api_client()
|
| 178 |
+
|
| 179 |
+
messages = [
|
| 180 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 181 |
+
{"role": "user", "content": prompt},
|
| 182 |
+
]
|
| 183 |
+
|
| 184 |
+
response = client.chat.completions.create(
|
| 185 |
+
model=self.model_name,
|
| 186 |
+
messages=messages,
|
| 187 |
+
max_tokens=self.settings.models.llm_max_new_tokens,
|
| 188 |
+
temperature=self.settings.models.llm_temperature,
|
| 189 |
+
top_p=self.settings.models.llm_top_p,
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
return response.choices[0].message.content.strip()
|
| 193 |
+
|
| 194 |
+
def _generate_local(self, prompt: str) -> str:
|
| 195 |
+
"""Generate using local model."""
|
| 196 |
+
import torch
|
| 197 |
+
|
| 198 |
+
self._load_local_model()
|
| 199 |
+
|
| 200 |
+
messages = [
|
| 201 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 202 |
+
{"role": "user", "content": prompt},
|
| 203 |
+
]
|
| 204 |
+
|
| 205 |
+
text = self._local_tokenizer.apply_chat_template(
|
| 206 |
+
messages,
|
| 207 |
+
tokenize=False,
|
| 208 |
+
add_generation_prompt=True,
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
inputs = self._local_tokenizer(text, return_tensors="pt").to(self._local_model.device)
|
| 212 |
+
|
| 213 |
+
with torch.no_grad():
|
| 214 |
+
outputs = self._local_model.generate(
|
| 215 |
+
**inputs,
|
| 216 |
+
max_new_tokens=self.settings.models.llm_max_new_tokens,
|
| 217 |
+
temperature=self.settings.models.llm_temperature,
|
| 218 |
+
top_p=self.settings.models.llm_top_p,
|
| 219 |
+
do_sample=True,
|
| 220 |
+
pad_token_id=self._local_tokenizer.eos_token_id,
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
generated = outputs[0][inputs["input_ids"].shape[1]:]
|
| 224 |
+
response = self._local_tokenizer.decode(generated, skip_special_tokens=True)
|
| 225 |
+
|
| 226 |
+
return response.strip()
|
| 227 |
+
|
| 228 |
+
def unload(self) -> None:
|
| 229 |
+
"""Unload models from memory."""
|
| 230 |
+
if self._local_model is not None:
|
| 231 |
+
del self._local_model
|
| 232 |
+
self._local_model = None
|
| 233 |
+
if self._local_tokenizer is not None:
|
| 234 |
+
del self._local_tokenizer
|
| 235 |
+
self._local_tokenizer = None
|
| 236 |
+
|
| 237 |
+
import torch
|
| 238 |
+
if torch.cuda.is_available():
|
| 239 |
+
torch.cuda.empty_cache()
|
| 240 |
+
|
| 241 |
+
logger.info("Models unloaded")
|
coderag/generation/prompts.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System prompts for grounded code Q&A."""
|
| 2 |
+
|
| 3 |
+
SYSTEM_PROMPT = """You are a code assistant that answers questions about a repository.
|
| 4 |
+
|
| 5 |
+
CRITICAL RULES - YOU MUST FOLLOW THESE:
|
| 6 |
+
|
| 7 |
+
1. FIRST, check if the retrieved chunks are RELEVANT to the question being asked.
|
| 8 |
+
- If the chunks discuss completely different topics than the question, respond:
|
| 9 |
+
"I could not find information about this in the indexed repository."
|
| 10 |
+
- Do NOT try to make connections that don't exist.
|
| 11 |
+
|
| 12 |
+
2. Only answer based on EXPLICIT information in the provided code chunks.
|
| 13 |
+
- Every claim MUST have a citation: [file_path:start_line-end_line]
|
| 14 |
+
- If you cannot cite it, do NOT say it.
|
| 15 |
+
|
| 16 |
+
3. NEVER HALLUCINATE:
|
| 17 |
+
- Do NOT invent code, functions, files, or behaviors
|
| 18 |
+
- Do NOT answer questions about topics not in the chunks (e.g., if asked about "food inventory" but chunks are about "code embeddings", say you don't have that information)
|
| 19 |
+
- Do NOT make assumptions about what the code might do
|
| 20 |
+
|
| 21 |
+
4. When to refuse:
|
| 22 |
+
- The question is about something not covered in the chunks
|
| 23 |
+
- The chunks are about a completely different topic
|
| 24 |
+
- You would need to guess or speculate
|
| 25 |
+
|
| 26 |
+
CITATION FORMAT: [file_path:start_line-end_line]
|
| 27 |
+
Example: [src/auth.py:45-78]
|
| 28 |
+
|
| 29 |
+
RESPONSE FORMAT:
|
| 30 |
+
- Start with a direct answer IF AND ONLY IF the chunks contain relevant information
|
| 31 |
+
- Include citations inline with every factual statement
|
| 32 |
+
- If showing code, quote it exactly from the chunks"""
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def build_prompt(question: str, context: str) -> str:
|
| 36 |
+
"""Build the full prompt with context and question.
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
question: User's question
|
| 40 |
+
context: Retrieved code chunks formatted as context
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
Complete prompt for the LLM
|
| 44 |
+
"""
|
| 45 |
+
return f"""Based on the following code chunks from the repository, answer the question.
|
| 46 |
+
|
| 47 |
+
## Retrieved Code Chunks
|
| 48 |
+
|
| 49 |
+
{context}
|
| 50 |
+
|
| 51 |
+
## Question
|
| 52 |
+
|
| 53 |
+
{question}
|
| 54 |
+
|
| 55 |
+
## Answer
|
| 56 |
+
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def build_no_context_response() -> str:
|
| 61 |
+
"""Response when no relevant context is found."""
|
| 62 |
+
return "I could not find information about this in the indexed repository."
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def build_clarification_prompt(question: str, ambiguities: list[str]) -> str:
|
| 66 |
+
"""Build prompt asking for clarification."""
|
| 67 |
+
ambiguity_list = "\n".join(f"- {a}" for a in ambiguities)
|
| 68 |
+
return f"""Your question "{question}" is ambiguous. Could you clarify:
|
| 69 |
+
|
| 70 |
+
{ambiguity_list}
|
| 71 |
+
|
| 72 |
+
Please provide more specific details so I can give you an accurate answer."""
|
coderag/indexing/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Indexing module: Embedding generation and vector storage."""
|
| 2 |
+
|
| 3 |
+
from coderag.indexing.embeddings import EmbeddingGenerator
|
| 4 |
+
from coderag.indexing.vectorstore import VectorStore
|
| 5 |
+
|
| 6 |
+
__all__ = ["EmbeddingGenerator", "VectorStore"]
|
coderag/indexing/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (416 Bytes). View file
|
|
|
coderag/indexing/__pycache__/embeddings.cpython-313.pyc
ADDED
|
Binary file (8.05 kB). View file
|
|
|
coderag/indexing/__pycache__/vectorstore.cpython-313.pyc
ADDED
|
Binary file (8.95 kB). View file
|
|
|
coderag/indexing/embeddings.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Embedding generation using nomic-embed-text."""
|
| 2 |
+
|
| 3 |
+
from typing import Iterator, Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
from coderag.config import get_settings
|
| 9 |
+
from coderag.logging import get_logger
|
| 10 |
+
from coderag.models.chunk import Chunk
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EmbeddingGenerator:
|
| 16 |
+
"""Generates embeddings using nomic-embed-text v1.5."""
|
| 17 |
+
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
model_name: Optional[str] = None,
|
| 21 |
+
device: Optional[str] = None,
|
| 22 |
+
batch_size: Optional[int] = None,
|
| 23 |
+
) -> None:
|
| 24 |
+
settings = get_settings()
|
| 25 |
+
self.model_name = model_name or settings.models.embedding_name
|
| 26 |
+
self.device = self._resolve_device(device or settings.models.embedding_device)
|
| 27 |
+
self.batch_size = batch_size or settings.models.embedding_batch_size
|
| 28 |
+
self._model: Optional[SentenceTransformer] = None
|
| 29 |
+
|
| 30 |
+
def _resolve_device(self, device: str) -> str:
|
| 31 |
+
"""Resolve device, falling back to CPU if CUDA unavailable."""
|
| 32 |
+
if device == "auto":
|
| 33 |
+
return "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
+
if device == "cuda" and not torch.cuda.is_available():
|
| 35 |
+
logger.warning("CUDA not available, falling back to CPU for embeddings")
|
| 36 |
+
return "cpu"
|
| 37 |
+
return device
|
| 38 |
+
|
| 39 |
+
@property
|
| 40 |
+
def model(self) -> SentenceTransformer:
|
| 41 |
+
if self._model is None:
|
| 42 |
+
self._load_model()
|
| 43 |
+
return self._model
|
| 44 |
+
|
| 45 |
+
def _load_model(self) -> None:
|
| 46 |
+
logger.info("Loading embedding model", model=self.model_name, device=self.device)
|
| 47 |
+
self._model = SentenceTransformer(
|
| 48 |
+
self.model_name,
|
| 49 |
+
device=self.device,
|
| 50 |
+
trust_remote_code=True,
|
| 51 |
+
)
|
| 52 |
+
logger.info("Embedding model loaded")
|
| 53 |
+
|
| 54 |
+
def generate_embedding(self, text: str, is_query: bool = False) -> list[float]:
|
| 55 |
+
# nomic-embed uses task prefixes
|
| 56 |
+
if is_query:
|
| 57 |
+
text = f"search_query: {text}"
|
| 58 |
+
else:
|
| 59 |
+
text = f"search_document: {text}"
|
| 60 |
+
|
| 61 |
+
embedding = self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
|
| 62 |
+
return embedding.tolist()
|
| 63 |
+
|
| 64 |
+
def generate_embeddings(
|
| 65 |
+
self,
|
| 66 |
+
texts: list[str],
|
| 67 |
+
is_query: bool = False,
|
| 68 |
+
show_progress: bool = True,
|
| 69 |
+
) -> list[list[float]]:
|
| 70 |
+
# Add prefixes
|
| 71 |
+
if is_query:
|
| 72 |
+
texts = [f"search_query: {t}" for t in texts]
|
| 73 |
+
else:
|
| 74 |
+
texts = [f"search_document: {t}" for t in texts]
|
| 75 |
+
|
| 76 |
+
embeddings = self.model.encode(
|
| 77 |
+
texts,
|
| 78 |
+
batch_size=self.batch_size,
|
| 79 |
+
convert_to_numpy=True,
|
| 80 |
+
normalize_embeddings=True,
|
| 81 |
+
show_progress_bar=show_progress,
|
| 82 |
+
)
|
| 83 |
+
return embeddings.tolist()
|
| 84 |
+
|
| 85 |
+
def embed_chunks(
|
| 86 |
+
self,
|
| 87 |
+
chunks: list[Chunk],
|
| 88 |
+
show_progress: bool = True,
|
| 89 |
+
) -> list[Chunk]:
|
| 90 |
+
if not chunks:
|
| 91 |
+
return []
|
| 92 |
+
|
| 93 |
+
logger.info("Generating embeddings", num_chunks=len(chunks))
|
| 94 |
+
|
| 95 |
+
texts = [self._chunk_to_text(chunk) for chunk in chunks]
|
| 96 |
+
embeddings = self.generate_embeddings(texts, is_query=False, show_progress=show_progress)
|
| 97 |
+
|
| 98 |
+
for chunk, embedding in zip(chunks, embeddings):
|
| 99 |
+
chunk.embedding = embedding
|
| 100 |
+
|
| 101 |
+
logger.info("Embeddings generated", num_chunks=len(chunks))
|
| 102 |
+
return chunks
|
| 103 |
+
|
| 104 |
+
def embed_chunks_iter(
|
| 105 |
+
self,
|
| 106 |
+
chunks: Iterator[Chunk],
|
| 107 |
+
batch_size: Optional[int] = None,
|
| 108 |
+
) -> Iterator[Chunk]:
|
| 109 |
+
batch_size = batch_size or self.batch_size
|
| 110 |
+
batch: list[Chunk] = []
|
| 111 |
+
|
| 112 |
+
for chunk in chunks:
|
| 113 |
+
batch.append(chunk)
|
| 114 |
+
if len(batch) >= batch_size:
|
| 115 |
+
yield from self._embed_batch(batch)
|
| 116 |
+
batch = []
|
| 117 |
+
|
| 118 |
+
if batch:
|
| 119 |
+
yield from self._embed_batch(batch)
|
| 120 |
+
|
| 121 |
+
def _embed_batch(self, batch: list[Chunk]) -> Iterator[Chunk]:
|
| 122 |
+
texts = [self._chunk_to_text(chunk) for chunk in batch]
|
| 123 |
+
embeddings = self.generate_embeddings(texts, is_query=False, show_progress=False)
|
| 124 |
+
|
| 125 |
+
for chunk, embedding in zip(batch, embeddings):
|
| 126 |
+
chunk.embedding = embedding
|
| 127 |
+
yield chunk
|
| 128 |
+
|
| 129 |
+
def _chunk_to_text(self, chunk: Chunk) -> str:
|
| 130 |
+
parts = []
|
| 131 |
+
if chunk.name:
|
| 132 |
+
parts.append(f"{chunk.chunk_type.value}: {chunk.name}")
|
| 133 |
+
if chunk.metadata.signature:
|
| 134 |
+
parts.append(f"Signature: {chunk.metadata.signature}")
|
| 135 |
+
if chunk.metadata.docstring:
|
| 136 |
+
parts.append(f"Docstring: {chunk.metadata.docstring[:200]}")
|
| 137 |
+
parts.append(f"File: {chunk.file_path}")
|
| 138 |
+
parts.append(chunk.content)
|
| 139 |
+
return "\n".join(parts)
|
| 140 |
+
|
| 141 |
+
def unload(self) -> None:
|
| 142 |
+
if self._model is not None:
|
| 143 |
+
del self._model
|
| 144 |
+
self._model = None
|
| 145 |
+
if torch.cuda.is_available():
|
| 146 |
+
torch.cuda.empty_cache()
|
| 147 |
+
logger.info("Embedding model unloaded")
|
coderag/indexing/vectorstore.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ChromaDB vector store operations."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
import chromadb
|
| 7 |
+
from chromadb.config import Settings
|
| 8 |
+
|
| 9 |
+
from coderag.config import get_settings
|
| 10 |
+
from coderag.logging import get_logger
|
| 11 |
+
from coderag.models.chunk import Chunk
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class VectorStore:
|
| 17 |
+
"""ChromaDB vector store for chunk storage and retrieval."""
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
persist_directory: Optional[Path] = None,
|
| 22 |
+
collection_name: Optional[str] = None,
|
| 23 |
+
) -> None:
|
| 24 |
+
settings = get_settings()
|
| 25 |
+
self.persist_directory = persist_directory or settings.vectorstore.persist_directory
|
| 26 |
+
self.collection_name = collection_name or settings.vectorstore.collection_name
|
| 27 |
+
self._client: Optional[chromadb.PersistentClient] = None
|
| 28 |
+
self._collection: Optional[chromadb.Collection] = None
|
| 29 |
+
|
| 30 |
+
@property
|
| 31 |
+
def client(self) -> chromadb.PersistentClient:
|
| 32 |
+
if self._client is None:
|
| 33 |
+
self._init_client()
|
| 34 |
+
return self._client
|
| 35 |
+
|
| 36 |
+
@property
|
| 37 |
+
def collection(self) -> chromadb.Collection:
|
| 38 |
+
if self._collection is None:
|
| 39 |
+
self._init_collection()
|
| 40 |
+
return self._collection
|
| 41 |
+
|
| 42 |
+
def _init_client(self) -> None:
|
| 43 |
+
logger.info("Initializing ChromaDB", path=str(self.persist_directory))
|
| 44 |
+
self.persist_directory.mkdir(parents=True, exist_ok=True)
|
| 45 |
+
self._client = chromadb.PersistentClient(
|
| 46 |
+
path=str(self.persist_directory),
|
| 47 |
+
settings=Settings(anonymized_telemetry=False),
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
def _init_collection(self) -> None:
|
| 51 |
+
self._collection = self.client.get_or_create_collection(
|
| 52 |
+
name=self.collection_name,
|
| 53 |
+
metadata={"hnsw:space": "cosine"},
|
| 54 |
+
)
|
| 55 |
+
logger.info("Collection initialized", name=self.collection_name)
|
| 56 |
+
|
| 57 |
+
def add_chunks(self, chunks: list[Chunk]) -> int:
|
| 58 |
+
if not chunks:
|
| 59 |
+
return 0
|
| 60 |
+
|
| 61 |
+
ids = [chunk.id for chunk in chunks]
|
| 62 |
+
embeddings = [chunk.embedding for chunk in chunks if chunk.embedding]
|
| 63 |
+
documents = [chunk.content for chunk in chunks]
|
| 64 |
+
metadatas = [chunk.to_dict() for chunk in chunks]
|
| 65 |
+
|
| 66 |
+
# Remove embedding and filter None values (ChromaDB doesn't accept None)
|
| 67 |
+
cleaned_metadatas = []
|
| 68 |
+
for m in metadatas:
|
| 69 |
+
m.pop("embedding", None)
|
| 70 |
+
m.pop("content", None) # Already stored in documents
|
| 71 |
+
# Filter out None values - ChromaDB only accepts str, int, float, bool
|
| 72 |
+
cleaned = {k: v for k, v in m.items() if v is not None}
|
| 73 |
+
cleaned_metadatas.append(cleaned)
|
| 74 |
+
|
| 75 |
+
self.collection.add(
|
| 76 |
+
ids=ids,
|
| 77 |
+
embeddings=embeddings,
|
| 78 |
+
documents=documents,
|
| 79 |
+
metadatas=cleaned_metadatas,
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
logger.info("Chunks added to vector store", count=len(chunks))
|
| 83 |
+
return len(chunks)
|
| 84 |
+
|
| 85 |
+
def query(
|
| 86 |
+
self,
|
| 87 |
+
query_embedding: list[float],
|
| 88 |
+
repo_id: str,
|
| 89 |
+
top_k: int = 5,
|
| 90 |
+
similarity_threshold: float = 0.0,
|
| 91 |
+
) -> list[tuple[Chunk, float]]:
|
| 92 |
+
results = self.collection.query(
|
| 93 |
+
query_embeddings=[query_embedding],
|
| 94 |
+
n_results=top_k,
|
| 95 |
+
where={"repo_id": repo_id},
|
| 96 |
+
include=["documents", "metadatas", "distances"],
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
chunks_with_scores = []
|
| 100 |
+
if results["ids"] and results["ids"][0]:
|
| 101 |
+
for i, chunk_id in enumerate(results["ids"][0]):
|
| 102 |
+
# ChromaDB returns distances, convert to similarity for cosine
|
| 103 |
+
distance = results["distances"][0][i]
|
| 104 |
+
similarity = 1 - distance
|
| 105 |
+
|
| 106 |
+
if similarity >= similarity_threshold:
|
| 107 |
+
metadata = results["metadatas"][0][i]
|
| 108 |
+
metadata["id"] = chunk_id
|
| 109 |
+
metadata["content"] = results["documents"][0][i]
|
| 110 |
+
chunk = Chunk.from_dict(metadata)
|
| 111 |
+
chunks_with_scores.append((chunk, similarity))
|
| 112 |
+
|
| 113 |
+
return chunks_with_scores
|
| 114 |
+
|
| 115 |
+
def delete_repo_chunks(self, repo_id: str) -> int:
|
| 116 |
+
# Get all chunks for this repo
|
| 117 |
+
results = self.collection.get(where={"repo_id": repo_id}, include=[])
|
| 118 |
+
|
| 119 |
+
if results["ids"]:
|
| 120 |
+
self.collection.delete(ids=results["ids"])
|
| 121 |
+
count = len(results["ids"])
|
| 122 |
+
logger.info("Deleted repo chunks", repo_id=repo_id, count=count)
|
| 123 |
+
return count
|
| 124 |
+
return 0
|
| 125 |
+
|
| 126 |
+
def delete_file_chunks(self, repo_id: str, file_path: str) -> int:
|
| 127 |
+
"""Delete chunks for a specific file in a repository (for incremental updates)."""
|
| 128 |
+
results = self.collection.get(
|
| 129 |
+
where={"$and": [{"repo_id": repo_id}, {"file_path": file_path}]},
|
| 130 |
+
include=[],
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
if results["ids"]:
|
| 134 |
+
self.collection.delete(ids=results["ids"])
|
| 135 |
+
count = len(results["ids"])
|
| 136 |
+
logger.info("Deleted file chunks", repo_id=repo_id, file_path=file_path, count=count)
|
| 137 |
+
return count
|
| 138 |
+
return 0
|
| 139 |
+
|
| 140 |
+
def get_indexed_files(self, repo_id: str) -> set[str]:
|
| 141 |
+
"""Get set of file paths indexed for a repository."""
|
| 142 |
+
results = self.collection.get(
|
| 143 |
+
where={"repo_id": repo_id},
|
| 144 |
+
include=["metadatas"],
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
files = set()
|
| 148 |
+
if results["metadatas"]:
|
| 149 |
+
for metadata in results["metadatas"]:
|
| 150 |
+
if "file_path" in metadata:
|
| 151 |
+
files.add(metadata["file_path"])
|
| 152 |
+
return files
|
| 153 |
+
|
| 154 |
+
def get_repo_chunk_count(self, repo_id: str) -> int:
|
| 155 |
+
results = self.collection.get(where={"repo_id": repo_id}, include=[])
|
| 156 |
+
return len(results["ids"]) if results["ids"] else 0
|
| 157 |
+
|
| 158 |
+
def get_all_repo_ids(self) -> list[str]:
|
| 159 |
+
results = self.collection.get(include=["metadatas"])
|
| 160 |
+
repo_ids = set()
|
| 161 |
+
if results["metadatas"]:
|
| 162 |
+
for metadata in results["metadatas"]:
|
| 163 |
+
if "repo_id" in metadata:
|
| 164 |
+
repo_ids.add(metadata["repo_id"])
|
| 165 |
+
return list(repo_ids)
|
| 166 |
+
|
| 167 |
+
def clear(self) -> None:
|
| 168 |
+
self.client.delete_collection(self.collection_name)
|
| 169 |
+
self._collection = None
|
| 170 |
+
logger.info("Collection cleared", name=self.collection_name)
|
coderag/ingestion/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingestion module: Repository loading, file filtering, and semantic chunking."""
|
| 2 |
+
|
| 3 |
+
from coderag.ingestion.validator import GitHubURLValidator
|
| 4 |
+
from coderag.ingestion.loader import RepositoryLoader
|
| 5 |
+
from coderag.ingestion.filter import FileFilter
|
| 6 |
+
from coderag.ingestion.chunker import CodeChunker
|
| 7 |
+
|
| 8 |
+
__all__ = ["GitHubURLValidator", "RepositoryLoader", "FileFilter", "CodeChunker"]
|
coderag/ingestion/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (566 Bytes). View file
|
|
|
coderag/ingestion/__pycache__/chunker.cpython-313.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
coderag/ingestion/__pycache__/filter.cpython-313.pyc
ADDED
|
Binary file (4.27 kB). View file
|
|
|
coderag/ingestion/__pycache__/loader.cpython-313.pyc
ADDED
|
Binary file (6.14 kB). View file
|
|
|
coderag/ingestion/__pycache__/validator.cpython-313.pyc
ADDED
|
Binary file (6.95 kB). View file
|
|
|
coderag/ingestion/chunker.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Code chunking with Tree-sitter and text fallback."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Iterator, Optional
|
| 6 |
+
|
| 7 |
+
from coderag.config import get_settings
|
| 8 |
+
from coderag.logging import get_logger
|
| 9 |
+
from coderag.models.chunk import Chunk, ChunkMetadata, ChunkType
|
| 10 |
+
from coderag.models.document import Document
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class ChunkerConfig:
    """Chunker configuration (all sizes are measured in characters)."""
    chunk_size: int = 1500    # target size of a text-fallback chunk
    chunk_overlap: int = 200  # overlap carried between consecutive text chunks
    min_chunk_size: int = 50  # chunks smaller than this (after strip) are dropped
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CodeChunker:
    """Chunks code files into semantic units.

    Python sources are split along Tree-sitter definition boundaries
    (functions, classes, methods); everything else — and any Python file
    that fails to parse — falls back to size-based text chunking.
    """

    # Node types treated as function-like definitions.
    # NOTE(review): recent tree-sitter-python grammars emit async defs as
    # plain "function_definition" nodes; the async type is kept defensively.
    _FUNC_TYPES = ("function_definition", "async_function_definition")

    def __init__(self, config: Optional[ChunkerConfig] = None) -> None:
        """Create a chunker; sizes default to the ingestion settings."""
        settings = get_settings()
        self.config = config or ChunkerConfig(
            chunk_size=settings.ingestion.chunk_size,
            chunk_overlap=settings.ingestion.chunk_overlap,
        )
        self._tree_sitter_available = self._check_tree_sitter()

    def _check_tree_sitter(self) -> bool:
        """Return True when the tree-sitter Python grammar is importable."""
        try:
            import tree_sitter_python  # noqa: F401
            return True
        except ImportError:
            logger.warning("Tree-sitter not available, using text chunking")
            return False

    def chunk_document(self, document: Document) -> Iterator[Chunk]:
        """Yield chunks for *document*, choosing the best available strategy."""
        if document.language == "python" and self._tree_sitter_available:
            yield from self._chunk_python(document)
        else:
            yield from self._chunk_text(document)

    def _chunk_python(self, document: Document) -> Iterator[Chunk]:
        """Parse *document* with Tree-sitter; fall back to text on any failure."""
        try:
            import tree_sitter_python as tspython
            from tree_sitter import Language, Parser

            py_language = Language(tspython.language())
            parser = Parser(py_language)
            tree = parser.parse(bytes(document.content, "utf-8"))

            yield from self._extract_python_chunks(tree.root_node, document)

        except Exception as e:
            logger.warning("Tree-sitter parsing failed, falling back to text", error=str(e))
            yield from self._chunk_text(document)

    @staticmethod
    def _unwrap_decorated(node):
        """Return the definition wrapped by a ``decorated_definition`` node.

        Decorated functions/classes are nested one level deeper in the parse
        tree; without unwrapping they were previously invisible, pushing whole
        files of decorated code into plain text chunking.
        """
        if node.type == "decorated_definition":
            for child in node.children:
                if child.type in (
                    "function_definition",
                    "async_function_definition",
                    "class_definition",
                ):
                    return child
        return node

    def _extract_python_chunks(self, node, document: Document) -> Iterator[Chunk]:
        """Yield one chunk per top-level definition, plus one per method."""
        lines = document.content.split("\n")
        found_definition = False

        for child in node.children:
            definition = self._unwrap_decorated(child)

            if definition.type in self._FUNC_TYPES:
                found_definition = True
                yield self._create_chunk_from_node(
                    definition, document, lines, ChunkType.FUNCTION, span_node=child
                )
            elif definition.type == "class_definition":
                found_definition = True
                yield self._create_chunk_from_node(
                    definition, document, lines, ChunkType.CLASS, span_node=child
                )
                # Also extract the class's methods as separate chunks.
                class_name = self._get_node_name(definition)
                for class_child in definition.children:
                    if class_child.type != "block":
                        continue
                    for block_child in class_child.children:
                        method = self._unwrap_decorated(block_child)
                        if method.type in self._FUNC_TYPES:
                            yield self._create_chunk_from_node(
                                method,
                                document,
                                lines,
                                ChunkType.METHOD,
                                parent_name=class_name,
                                span_node=block_child,
                            )

        # No definitions at module level (e.g. a flat script): fall back to
        # text chunking so the content is not silently dropped.
        if not found_definition:
            yield from self._chunk_text(document)

    def _create_chunk_from_node(
        self,
        node,
        document: Document,
        lines: list[str],
        chunk_type: ChunkType,
        parent_name: Optional[str] = None,
        span_node=None,
    ) -> Chunk:
        """Build a Chunk for a definition node.

        Args:
            node: The definition used for name/signature/docstring extraction.
            chunk_type: Semantic category recorded in the chunk metadata.
            parent_name: Enclosing class name for methods, if any.
            span_node: Optional enclosing node (e.g. a decorated definition)
                whose full line range — decorators included — the chunk
                content should cover; defaults to *node*.
        """
        span = span_node if span_node is not None else node
        start_line = span.start_point[0] + 1  # tree-sitter rows are 0-based
        end_line = span.end_point[0] + 1
        content = "\n".join(lines[start_line - 1:end_line])

        metadata = ChunkMetadata(
            file_path=document.file_path,
            start_line=start_line,
            end_line=end_line,
            chunk_type=chunk_type,
            language=document.language,
            name=self._get_node_name(node),
            signature=self._get_signature(node, lines),
            docstring=self._get_docstring(node),
            parent_name=parent_name,
        )

        return Chunk(content=content, metadata=metadata, repo_id=document.repo_id)

    def _get_node_name(self, node) -> Optional[str]:
        """Return the identifier of a definition node, if present."""
        for child in node.children:
            if child.type == "identifier":
                return child.text.decode("utf-8")
        return None

    def _get_signature(self, node, lines: list[str]) -> Optional[str]:
        """Return the stripped first line of a function definition."""
        if node.type in self._FUNC_TYPES:
            return lines[node.start_point[0]].strip()
        return None

    def _get_docstring(self, node) -> Optional[str]:
        """Return the docstring text of a definition, without quote delimiters."""
        for child in node.children:
            if child.type != "block":
                continue
            for block_child in child.children:
                if block_child.type == "expression_statement":
                    for expr_child in block_child.children:
                        if expr_child.type == "string":
                            raw = expr_child.text.decode("utf-8").strip()
                            return self._strip_string_delimiters(raw)
        return None

    @staticmethod
    def _strip_string_delimiters(raw: str) -> str:
        """Strip one matched pair of quote delimiters from a string literal.

        BUG FIX: the previous implementation called ``str.strip`` with a
        string of quote characters, which treats the argument as a character
        *set* — any quote characters belonging to the docstring text itself
        were also removed (e.g. a docstring ending in a quoted word lost its
        closing quote). Only a single matching delimiter pair is removed
        here. String prefixes (r/f/b) are not handled — TODO if they appear.
        """
        for quote in ('"""', "'''", '"', "'"):
            if raw.startswith(quote) and raw.endswith(quote) and len(raw) >= 2 * len(quote):
                return raw[len(quote):-len(quote)]
        return raw

    def _chunk_text(self, document: Document) -> Iterator[Chunk]:
        """Split *document* into overlapping, size-bounded line chunks."""
        lines = document.content.split("\n")
        chunk_size = self.config.chunk_size
        overlap = self.config.chunk_overlap

        current_start = 0
        while current_start < len(lines):
            # Grow the chunk line by line until it reaches chunk_size chars.
            char_count = 0
            end_line = current_start
            while end_line < len(lines) and char_count < chunk_size:
                char_count += len(lines[end_line]) + 1  # +1 for the newline
                end_line += 1

            content = "\n".join(lines[current_start:end_line])

            if len(content.strip()) >= self.config.min_chunk_size:
                metadata = ChunkMetadata(
                    file_path=document.file_path,
                    start_line=current_start + 1,
                    end_line=end_line,
                    chunk_type=ChunkType.TEXT,
                    language=document.language,
                )
                yield Chunk(content=content, metadata=metadata, repo_id=document.repo_id)

            # Step forward, re-including roughly `overlap` chars of context.
            overlap_lines = 0
            overlap_chars = 0
            while overlap_lines < end_line - current_start and overlap_chars < overlap:
                overlap_chars += len(lines[end_line - 1 - overlap_lines]) + 1
                overlap_lines += 1

            next_start = end_line - overlap_lines
            if next_start <= current_start:
                # BUG FIX: a single line longer than chunk_size lets the
                # overlap swallow the whole step, so the start never advanced
                # — an infinite loop re-yielding the same chunk (or, on the
                # very first chunk, silently truncating the rest of the
                # file). Force at least one line of progress instead.
                next_start = current_start + 1
            current_start = next_start
            if end_line >= len(lines):
                break

    def chunk_files(self, documents: Iterator[Document]) -> Iterator[Chunk]:
        """Chunk every document, logging per-document and total counts."""
        total_chunks = 0
        for doc in documents:
            doc_chunks = 0
            for chunk in self.chunk_document(doc):
                doc_chunks += 1
                total_chunks += 1
                yield chunk
            logger.debug("Document chunked", file=doc.file_path, chunks=doc_chunks)
        logger.info("Chunking complete", total_chunks=total_chunks)
|
coderag/ingestion/filter.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""File filtering for repository indexing."""
|
| 2 |
+
|
| 3 |
+
import fnmatch
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Iterator, Optional
|
| 6 |
+
|
| 7 |
+
from coderag.config import get_settings
|
| 8 |
+
from coderag.logging import get_logger
|
| 9 |
+
|
| 10 |
+
logger = get_logger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class FileFilter:
    """Filters repository files for indexing based on glob patterns.

    A file is kept only when it matches an include pattern, matches no
    exclude pattern, is within the size limit, and does not look binary.
    """

    def __init__(
        self,
        include_patterns: Optional[list[str]] = None,
        exclude_patterns: Optional[list[str]] = None,
        max_file_size_kb: Optional[int] = None,
    ) -> None:
        """Initialize from explicit arguments, falling back to settings."""
        settings = get_settings()
        self.include_patterns = include_patterns or settings.ingestion.include_patterns
        self.exclude_patterns = exclude_patterns or settings.ingestion.exclude_patterns
        self.max_file_size = (max_file_size_kb or settings.ingestion.max_file_size_kb) * 1024

    def should_include(self, file_path: Path, repo_root: Path) -> bool:
        """Decide whether *file_path* should be indexed; exclusions win."""
        rel = str(file_path.relative_to(repo_root))
        # A pattern may match either the repo-relative path or the bare
        # file name, for both the exclude and include lists.
        if any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in self.exclude_patterns
            for candidate in (rel, file_path.name)
        ):
            return False
        return any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in self.include_patterns
            for candidate in (file_path.name, rel)
        )

    def check_file_size(self, file_path: Path) -> bool:
        """Return True when the file can be stat'ed and fits the size limit."""
        try:
            size = file_path.stat().st_size
        except OSError:
            return False
        return size <= self.max_file_size

    def is_binary(self, file_path: Path) -> bool:
        """Heuristic binary detection: a NUL byte within the first 8 KiB.

        Unreadable files are reported as binary so they get skipped.
        """
        try:
            with file_path.open("rb") as handle:
                return b"\x00" in handle.read(8192)
        except OSError:
            return True

    def filter_files(self, repo_root: Path) -> Iterator[Path]:
        """Recursively yield indexable files under *repo_root*."""
        kept = 0
        dropped = 0

        for candidate in repo_root.rglob("*"):
            if not candidate.is_file():
                continue

            if not self.should_include(candidate, repo_root):
                dropped += 1
                continue

            if not self.check_file_size(candidate):
                logger.debug("Skipping large file", path=str(candidate))
                dropped += 1
                continue

            if self.is_binary(candidate):
                logger.debug("Skipping binary file", path=str(candidate))
                dropped += 1
                continue

            kept += 1
            yield candidate

        logger.info("File filtering complete", included=kept, skipped=dropped)
|
coderag/ingestion/loader.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Repository loading and cloning."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Callable, Optional
|
| 5 |
+
|
| 6 |
+
from git import Repo, GitCommandError
|
| 7 |
+
|
| 8 |
+
from coderag.config import get_settings
|
| 9 |
+
from coderag.logging import get_logger
|
| 10 |
+
from coderag.ingestion.validator import GitHubRepoInfo
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
ProgressCallback = Callable[[str, int], None]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LoaderError(Exception):
    """Repository loading error.

    Raised when cloning or updating a cached repository fails.
    """
    pass
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class RepositoryLoader:
    """Loads repositories from GitHub.

    Clones into a local cache directory laid out as ``cache_dir/owner/name``,
    using shallow single-branch clones, and updates existing clones in place.
    """

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Create the loader; defaults to the configured repos cache dir."""
        settings = get_settings()
        self.cache_dir = cache_dir or settings.ingestion.repos_cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the local cache path for *repo_info* (cache/owner/name)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Clone (or refresh) the repository, returning its local path.

        Branch candidates are tried in order: the explicit *branch*, the
        repo's detected default branch, then "main" and "master".

        Raises:
            LoaderError: if no candidate branch can be cloned, or an existing
                clone fails to update.
        """
        repo_path = self.get_repo_path(repo_info)

        # Try branches in order: specified, repo default, main, master
        branches_to_try = []
        if branch:
            branches_to_try.append(branch)
        if repo_info.branch and repo_info.branch not in branches_to_try:
            branches_to_try.append(repo_info.branch)
        if "main" not in branches_to_try:
            branches_to_try.append("main")
        if "master" not in branches_to_try:
            branches_to_try.append("master")

        if repo_path.exists():
            # An existing clone is refreshed in place; only the first branch
            # candidate is used for the update.
            logger.info("Repository exists, updating", path=str(repo_path))
            return self._update_repository(repo_path, branches_to_try[0], progress_callback)

        if progress_callback:
            progress_callback("Cloning repository", 0)

        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error = None
        for try_branch in branches_to_try:
            try:
                logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
                # Shallow single-branch clone keeps the cache small and fast.
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                )
                if progress_callback:
                    progress_callback("Clone complete", 100)
                logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
                return repo_path
            except GitCommandError as e:
                last_error = e
                logger.debug("Branch not found, trying next", branch=try_branch)
                # Clean up partial clone if any
                import shutil
                shutil.rmtree(repo_path, ignore_errors=True)
                continue

        raise LoaderError(f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}")

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch/checkout/pull an existing clone; on failure, drop the cache.

        Raises:
            LoaderError: if the update fails. The cached copy is removed
                first so the next attempt starts from a fresh clone.
        """
        try:
            repo = Repo(repo_path)
            if progress_callback:
                progress_callback("Fetching updates", 30)
            repo.remotes.origin.fetch()
            repo.git.checkout(branch)
            repo.remotes.origin.pull()
            if progress_callback:
                progress_callback("Update complete", 100)
            logger.info("Repository updated", path=str(repo_path))
            return repo_path
        except GitCommandError as e:
            logger.warning("Update failed, re-cloning", error=str(e))
            import shutil
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}")

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return True when a local clone already exists for *repo_info*."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached clone for *repo_info*, if present."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            import shutil
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))
|
coderag/ingestion/validator.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GitHub URL validation and parsing."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import httpx
|
| 8 |
+
|
| 9 |
+
from coderag.logging import get_logger
|
| 10 |
+
|
| 11 |
+
logger = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class GitHubRepoInfo:
    """Parsed GitHub repository information."""

    owner: str  # GitHub account or organization name
    name: str  # repository name (without a trailing ".git")
    url: str  # canonical https URL of the repository
    branch: Optional[str] = None  # default branch; filled in by the API accessibility check

    @property
    def full_name(self) -> str:
        """Return the "owner/name" identifier."""
        return f"{self.owner}/{self.name}"

    @property
    def clone_url(self) -> str:
        """Return the https clone URL (ends with ".git")."""
        return f"https://github.com/{self.owner}/{self.name}.git"

    @property
    def api_url(self) -> str:
        """Return the GitHub REST API endpoint for this repository."""
        return f"https://api.github.com/repos/{self.owner}/{self.name}"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ValidationError(Exception):
    """URL validation error.

    Raised for malformed GitHub URLs, invalid owner/repo names, and
    repositories that are missing, private, or unreachable.
    """
    pass
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class GitHubURLValidator:
    """Validates and parses GitHub repository URLs.

    Accepts full https URLs, ssh URLs, and the short ``owner/repo`` form.
    """

    # Tried in order: full https URL, ssh URL, bare "owner/repo".
    GITHUB_PATTERNS = [
        r"^https?://github\.com/(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?/?$",
        r"^git@github\.com:(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?$",
        r"^(?P<owner>[a-zA-Z0-9](?:[a-zA-Z0-9]|-(?=[a-zA-Z0-9])){0,38})/(?P<name>[a-zA-Z0-9._-]+)$",
    ]

    def __init__(self, timeout: float = 10.0) -> None:
        """Args: timeout: seconds allowed for the GitHub API accessibility check."""
        self.timeout = timeout
        self._patterns = [re.compile(p) for p in self.GITHUB_PATTERNS]

    def parse_url(self, url: str) -> GitHubRepoInfo:
        """Parse *url* into a :class:`GitHubRepoInfo` (no network access).

        Raises:
            ValidationError: if the URL matches no known GitHub form or the
                owner/name segments contain invalid characters.
        """
        url = url.strip()
        for pattern in self._patterns:
            match = pattern.match(url)
            if not match:
                continue
            owner = match.group("owner")
            # BUG FIX: rstrip(".git") strips any trailing '.', 'g', 'i', 't'
            # *characters*, mangling names such as "tig" -> "" or
            # "mygit" -> "my". removesuffix() removes only a literal ".git".
            name = match.group("name").removesuffix(".git")
            if not self._is_valid_name(owner) or not self._is_valid_name(name):
                raise ValidationError(f"Invalid owner or repository name: {url}")
            return GitHubRepoInfo(owner=owner, name=name, url=f"https://github.com/{owner}/{name}")
        raise ValidationError(f"Invalid GitHub URL: {url}. Expected: https://github.com/owner/repo")

    def _is_valid_name(self, name: str) -> bool:
        """Return True when *name* is a plausible GitHub owner/repo segment."""
        if not name or len(name) > 100:
            return False
        return bool(re.match(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$", name))

    async def validate_repository(self, url: str, check_accessibility: bool = True) -> GitHubRepoInfo:
        """Parse *url* and (optionally) confirm the repository is reachable.

        Raises:
            ValidationError: on a malformed URL or a failed accessibility check.
        """
        repo_info = self.parse_url(url)
        if check_accessibility:
            await self._check_repo_accessible(repo_info)
        logger.info("Repository validated", owner=repo_info.owner, name=repo_info.name)
        return repo_info

    async def _check_repo_accessible(self, repo_info: GitHubRepoInfo) -> None:
        """Query the GitHub API; raise ValidationError if the repo is unusable.

        Side effect: fills in ``repo_info.branch`` with the default branch
        reported by the API.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.get(repo_info.api_url)
                if response.status_code == 404:
                    raise ValidationError(f"Repository not found: {repo_info.full_name}")
                elif response.status_code == 403:
                    # NOTE(review): 403 can also be unauthenticated API rate
                    # limiting, not just access denial — confirm desired wording.
                    raise ValidationError(f"Access denied: {repo_info.full_name}")
                elif response.status_code != 200:
                    raise ValidationError(f"HTTP error {response.status_code}: {repo_info.full_name}")
                data = response.json()
                if data.get("private", False):
                    raise ValidationError(f"Private repository not supported: {repo_info.full_name}")
                repo_info.branch = data.get("default_branch", "main")
            except httpx.TimeoutException:
                raise ValidationError(f"Timeout checking repository: {repo_info.full_name}")
            except httpx.RequestError as e:
                # Chain the cause so network diagnostics aren't lost.
                raise ValidationError(f"Network error: {str(e)}") from e

    def validate_url_sync(self, url: str) -> GitHubRepoInfo:
        """Synchronous parse-only validation (no network access)."""
        return self.parse_url(url)
|
coderag/logging.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Structured logging configuration using structlog."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import sys
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import structlog
|
| 8 |
+
from structlog.types import Processor
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def setup_logging(
    level: str = "INFO",
    json_format: bool = False,
    log_file: str | None = None,
) -> None:
    """Configure structured logging for the application.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL).
        json_format: Emit JSON log lines (production) instead of colored text.
        log_file: Optional path of a file to also receive log output.
    """
    numeric_level = getattr(logging, level.upper())

    # Stdlib logging backs structlog's output.
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=numeric_level,
    )

    # Optionally mirror output into a file.
    if log_file:
        extra_handler = logging.FileHandler(log_file)
        extra_handler.setLevel(numeric_level)
        logging.getLogger().addHandler(extra_handler)

    # Processors common to both output formats.
    pipeline: list[Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.UnicodeDecoder(),
    ]

    if json_format:
        # Production: machine-readable JSON. Exception info is formatted
        # before the final renderer.
        pipeline += [
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Development: human-friendly colored console output.
        pipeline.append(structlog.dev.ConsoleRenderer(colors=True))

    structlog.configure(
        processors=pipeline,
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Get a structured logger instance.

    Args:
        name: Logger name (usually ``__name__`` of the calling module)

    Returns:
        A structlog ``BoundLogger`` honoring the ``setup_logging`` configuration
    """
    return structlog.get_logger(name)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class LogContext:
    """Context manager that temporarily binds structlog context variables."""

    def __init__(self, **kwargs: Any) -> None:
        """Remember the key/value pairs to bind while the context is active."""
        self.context = kwargs
        self._token: Any = None

    def __enter__(self) -> "LogContext":
        """Bind the stored variables into structlog's contextvars."""
        self._token = structlog.contextvars.bind_contextvars(**self.context)
        return self

    def __exit__(self, *args: Any) -> None:
        """Remove the previously bound variables again."""
        # Iterating the dict yields its keys, i.e. the names bound on entry.
        structlog.contextvars.unbind_contextvars(*self.context)


def log_operation(
    operation: str,
    **kwargs: Any,
) -> LogContext:
    """Create a logging context for an operation.

    Every log emitted inside the ``with`` block carries ``operation`` plus
    the given key/value pairs:

        with log_operation("indexing", repo_id="123"):
            logger.info("Starting indexing")  # includes repo_id
    """
    return LogContext(operation=operation, **kwargs)
|
coderag/main.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CodeRAG main application entry point."""
|
| 2 |
+
|
| 3 |
+
from contextlib import asynccontextmanager
|
| 4 |
+
|
| 5 |
+
import uvicorn
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
|
| 9 |
+
from coderag.config import get_settings
|
| 10 |
+
from coderag.logging import setup_logging, get_logger
|
| 11 |
+
|
| 12 |
+
# Initialize settings and logging
|
| 13 |
+
settings = get_settings()
|
| 14 |
+
setup_logging(level=settings.server.log_level.upper())
|
| 15 |
+
logger = get_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler.

    Logs a startup banner before the app begins serving and a shutdown
    message once it stops; no resources are acquired or released here.
    """
    logger.info(
        "Starting CodeRAG",
        app_name=settings.app_name,
        version=settings.app_version,
        debug=settings.debug,
    )
    yield  # the application serves requests while suspended here
    logger.info("Shutting down CodeRAG")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def create_app() -> FastAPI:
    """Create and configure the FastAPI application.

    Wires up CORS, a /health endpoint, the versioned REST API, and —
    best-effort — the MCP server and Gradio UI mounts.
    """
    app = FastAPI(
        title=settings.app_name,
        version=settings.app_version,
        description="RAG-based Q&A system for code repositories with verifiable citations",
        docs_url="/docs",
        redoc_url="/redoc",
        lifespan=lifespan,
    )

    # CORS middleware
    # NOTE(review): allow_origins=["*"] together with allow_credentials=True
    # is very permissive — confirm this is intended beyond demo deployments.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Health check endpoint
    @app.get("/health")
    async def health_check() -> dict:
        """Health check endpoint."""
        return {
            "status": "healthy",
            "app": settings.app_name,
            "version": settings.app_version,
        }

    # Register API routes
    # Deferred import — presumably to avoid import cycles at module load; confirm.
    from coderag.api.routes import router as api_router

    app.include_router(api_router, prefix="/api/v1")

    # Mount MCP server
    # Best-effort: a missing optional dependency must not prevent startup.
    try:
        from coderag.mcp.server import create_mcp_server

        mcp_server = create_mcp_server()
        mcp_app = mcp_server.streamable_http_app()
        app.mount("/mcp", mcp_app)
        logger.info("MCP server mounted at /mcp")
    except ImportError as e:
        logger.warning("MCP server not available", error=str(e))
    except Exception as e:
        logger.error("Failed to mount MCP server", error=str(e))

    # Mount Gradio UI
    # Mounted last, at the root path; also best-effort.
    try:
        from coderag.ui.app import create_gradio_app
        import gradio as gr

        gradio_app = create_gradio_app()
        app = gr.mount_gradio_app(app, gradio_app, path="/")
        logger.info("Gradio UI mounted at /")
    except ImportError as e:
        logger.warning("Gradio UI not available", error=str(e))
    except Exception as e:
        logger.error("Failed to mount Gradio UI", error=str(e))

    return app
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def main() -> None:
    """Run the application under uvicorn using the configured server settings."""
    app = create_app()

    logger.info(
        "Starting server",
        host=settings.server.host,
        port=settings.server.port,
    )

    # NOTE(review): uvicorn's `reload` and `workers` options only take effect
    # when an import string is passed instead of an app object — verify these
    # settings actually apply here.
    uvicorn.run(
        app,
        host=settings.server.host,
        port=settings.server.port,
        reload=settings.server.reload,
        workers=settings.server.workers,
        log_level=settings.server.log_level,
    )
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Application interrupted by user")
    except Exception as e:
        # Last-resort handler: log with traceback, print a visible banner,
        # and keep the terminal open so double-click launches don't vanish.
        logger.error("Application crashed", error=str(e), exc_info=True)
        import traceback
        print("\n" + "="*80)
        print("FATAL ERROR:")
        print("="*80)
        traceback.print_exc()
        print("="*80)
        input("Press Enter to close...")  # Keep terminal open
|
coderag/mcp/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MCP (Model Context Protocol) server for CodeRAG."""
|
| 2 |
+
|
| 3 |
+
from coderag.mcp.handlers import MCPHandlers, get_mcp_handlers
|
| 4 |
+
from coderag.mcp.server import create_mcp_server, mcp
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"MCPHandlers",
|
| 8 |
+
"get_mcp_handlers",
|
| 9 |
+
"create_mcp_server",
|
| 10 |
+
"mcp",
|
| 11 |
+
]
|
coderag/mcp/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (434 Bytes). View file
|
|
|
coderag/mcp/__pycache__/cli.cpython-313.pyc
ADDED
|
Binary file (1.23 kB). View file
|
|
|
coderag/mcp/__pycache__/handlers.cpython-313.pyc
ADDED
|
Binary file (23.9 kB). View file
|
|
|
coderag/mcp/__pycache__/prompts.cpython-313.pyc
ADDED
|
Binary file (4.72 kB). View file
|
|
|
coderag/mcp/__pycache__/resources.cpython-313.pyc
ADDED
|
Binary file (1.56 kB). View file
|
|
|
coderag/mcp/__pycache__/server.cpython-313.pyc
ADDED
|
Binary file (1.52 kB). View file
|
|
|
coderag/mcp/__pycache__/tools.cpython-313.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
coderag/mcp/cli.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI entry point for running MCP server in stdio mode."""
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Suppress all stdout output except MCP protocol
|
| 7 |
+
os.environ["PYTHONUNBUFFERED"] = "1"
|
| 8 |
+
|
| 9 |
+
# Redirect any stray prints to stderr
|
| 10 |
+
import io
|
| 11 |
+
_original_stdout = sys.stdout
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main():
    """Run the MCP server in stdio mode for Claude Desktop.

    The stdio transport uses stdout for the MCP protocol stream, so all
    logging is forced to stderr and structlog output is suppressed before
    the server package is imported.
    """
    # Suppress logging to stdout - redirect to stderr
    import logging
    logging.basicConfig(
        level=logging.WARNING,
        stream=sys.stderr,
        format="%(message)s"
    )

    # Suppress structlog output
    # Filtering at CRITICAL effectively silences structlog loggers.
    import structlog
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(logging.CRITICAL),
    )

    # Imported only after logging is configured — presumably so import-time
    # log calls cannot pollute the protocol stream; confirm.
    from coderag.mcp.server import create_mcp_server

    mcp = create_mcp_server()
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()
|