Spaces:
Running
Running
Commit
·
42f5b98
1
Parent(s):
1e19152
Update to v0.1.2
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +0 -7
- coderag/__init__.py +3 -0
- coderag/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/__pycache__/cli.cpython-313.pyc +0 -0
- coderag/__pycache__/config.cpython-313.pyc +0 -0
- coderag/__pycache__/logging.cpython-313.pyc +0 -0
- coderag/__pycache__/main.cpython-313.pyc +0 -0
- coderag/api/__init__.py +5 -0
- coderag/api/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/api/__pycache__/routes.cpython-313.pyc +0 -0
- coderag/api/__pycache__/schemas.cpython-313.pyc +0 -0
- coderag/api/routes.py +310 -0
- coderag/api/schemas.py +101 -0
- coderag/cli.py +675 -0
- coderag/config.py +154 -0
- coderag/generation/__init__.py +7 -0
- coderag/generation/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/generation/__pycache__/citations.cpython-313.pyc +0 -0
- coderag/generation/__pycache__/generator.cpython-313.pyc +0 -0
- coderag/generation/__pycache__/prompts.cpython-313.pyc +0 -0
- coderag/generation/citations.py +77 -0
- coderag/generation/generator.py +241 -0
- coderag/generation/prompts.py +72 -0
- coderag/indexing/__init__.py +6 -0
- coderag/indexing/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/indexing/__pycache__/embeddings.cpython-313.pyc +0 -0
- coderag/indexing/__pycache__/vectorstore.cpython-313.pyc +0 -0
- coderag/indexing/embeddings.py +147 -0
- coderag/indexing/vectorstore.py +170 -0
- coderag/ingestion/__init__.py +8 -0
- coderag/ingestion/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/chunker.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/filter.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/loader.cpython-313.pyc +0 -0
- coderag/ingestion/__pycache__/validator.cpython-313.pyc +0 -0
- coderag/ingestion/chunker.py +184 -0
- coderag/ingestion/filter.py +85 -0
- coderag/ingestion/loader.py +117 -0
- coderag/ingestion/validator.py +98 -0
- coderag/logging.py +111 -0
- coderag/main.py +128 -0
- coderag/mcp/__init__.py +11 -0
- coderag/mcp/__pycache__/__init__.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/cli.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/handlers.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/prompts.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/resources.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/server.cpython-313.pyc +0 -0
- coderag/mcp/__pycache__/tools.cpython-313.pyc +0 -0
- coderag/mcp/cli.py +37 -0
app.py
CHANGED
|
@@ -5,13 +5,6 @@ It's configured to work without GPU (embeddings on CPU, LLM via Groq).
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
-
import sys
|
| 9 |
-
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
# Add src to path for HF Spaces (no pip install -e . available)
|
| 12 |
-
src_path = Path(__file__).parent / "src"
|
| 13 |
-
if src_path.exists():
|
| 14 |
-
sys.path.insert(0, str(src_path))
|
| 15 |
|
| 16 |
# Configure for HF Spaces environment
|
| 17 |
os.environ.setdefault("MODEL_LLM_PROVIDER", "groq")
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Configure for HF Spaces environment
|
| 10 |
os.environ.setdefault("MODEL_LLM_PROVIDER", "groq")
|
coderag/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CodeRAG: RAG-based Q&A system for code repositories with verifiable citations."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
coderag/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (279 Bytes). View file
|
|
|
coderag/__pycache__/cli.cpython-313.pyc
ADDED
|
Binary file (33.6 kB). View file
|
|
|
coderag/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (6.4 kB). View file
|
|
|
coderag/__pycache__/logging.cpython-313.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
coderag/__pycache__/main.cpython-313.pyc
ADDED
|
Binary file (5.61 kB). View file
|
|
|
coderag/api/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API module: REST endpoints for programmatic access."""
|
| 2 |
+
|
| 3 |
+
from coderag.api.routes import router
|
| 4 |
+
|
| 5 |
+
__all__ = ["router"]
|
coderag/api/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (309 Bytes). View file
|
|
|
coderag/api/__pycache__/routes.cpython-313.pyc
ADDED
|
Binary file (12.9 kB). View file
|
|
|
coderag/api/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (5.38 kB). View file
|
|
|
coderag/api/routes.py
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""REST API routes."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
| 8 |
+
from fastapi.responses import JSONResponse
|
| 9 |
+
|
| 10 |
+
from coderag.api.schemas import (
|
| 11 |
+
IndexRepositoryRequest,
|
| 12 |
+
IndexRepositoryResponse,
|
| 13 |
+
QueryRequest,
|
| 14 |
+
QueryResponse,
|
| 15 |
+
ListRepositoriesResponse,
|
| 16 |
+
RepositoryInfo,
|
| 17 |
+
CitationResponse,
|
| 18 |
+
RetrievedChunkResponse,
|
| 19 |
+
ErrorResponse,
|
| 20 |
+
)
|
| 21 |
+
from coderag.config import get_settings
|
| 22 |
+
from coderag.generation.generator import ResponseGenerator
|
| 23 |
+
from coderag.indexing.embeddings import EmbeddingGenerator
|
| 24 |
+
from coderag.indexing.vectorstore import VectorStore
|
| 25 |
+
from coderag.ingestion.chunker import CodeChunker
|
| 26 |
+
from coderag.ingestion.filter import FileFilter
|
| 27 |
+
from coderag.ingestion.loader import RepositoryLoader
|
| 28 |
+
from coderag.ingestion.validator import GitHubURLValidator, ValidationError
|
| 29 |
+
from coderag.logging import get_logger
|
| 30 |
+
from coderag.models.document import Document
|
| 31 |
+
from coderag.models.query import Query as QueryModel
|
| 32 |
+
from coderag.models.repository import Repository, RepositoryStatus
|
| 33 |
+
|
| 34 |
+
logger = get_logger(__name__)
|
| 35 |
+
router = APIRouter()
|
| 36 |
+
|
| 37 |
+
# Global state (in production, use a proper database)
|
| 38 |
+
settings = get_settings()
|
| 39 |
+
repos_file = settings.data_dir / "repositories.json"
|
| 40 |
+
repositories: dict[str, Repository] = {}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_repositories() -> None:
|
| 44 |
+
"""Load repositories from disk."""
|
| 45 |
+
global repositories
|
| 46 |
+
if repos_file.exists():
|
| 47 |
+
try:
|
| 48 |
+
data = json.loads(repos_file.read_text())
|
| 49 |
+
repositories = {r["id"]: Repository.from_dict(r) for r in data}
|
| 50 |
+
except Exception as e:
|
| 51 |
+
logger.error("Failed to load repositories", error=str(e))
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def save_repositories() -> None:
|
| 55 |
+
"""Save repositories to disk."""
|
| 56 |
+
repos_file.parent.mkdir(parents=True, exist_ok=True)
|
| 57 |
+
data = [r.to_dict() for r in repositories.values()]
|
| 58 |
+
repos_file.write_text(json.dumps(data, indent=2))
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Load on startup
|
| 62 |
+
load_repositories()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def resolve_repo_id(partial_id: str) -> Optional[str]:
|
| 66 |
+
"""Resolve a partial repository ID to a full ID.
|
| 67 |
+
|
| 68 |
+
Supports both full UUIDs and partial IDs (first 8+ characters).
|
| 69 |
+
Returns None if no match or multiple matches found.
|
| 70 |
+
"""
|
| 71 |
+
# First try exact match
|
| 72 |
+
if partial_id in repositories:
|
| 73 |
+
return partial_id
|
| 74 |
+
|
| 75 |
+
# Try prefix match (minimum 8 characters recommended)
|
| 76 |
+
matches = [rid for rid in repositories.keys() if rid.startswith(partial_id)]
|
| 77 |
+
|
| 78 |
+
if len(matches) == 1:
|
| 79 |
+
return matches[0]
|
| 80 |
+
|
| 81 |
+
return None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_repo_or_404(repo_id: str) -> Repository:
|
| 85 |
+
"""Get a repository by ID (full or partial), raising 404 if not found."""
|
| 86 |
+
full_id = resolve_repo_id(repo_id)
|
| 87 |
+
if full_id is None:
|
| 88 |
+
raise HTTPException(status_code=404, detail="Repository not found")
|
| 89 |
+
return repositories[full_id]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
async def index_repository_task(
|
| 93 |
+
url: str,
|
| 94 |
+
repo_id: str,
|
| 95 |
+
branch: Optional[str],
|
| 96 |
+
include_patterns: Optional[list[str]],
|
| 97 |
+
exclude_patterns: Optional[list[str]],
|
| 98 |
+
) -> None:
|
| 99 |
+
"""Background task to index a repository."""
|
| 100 |
+
repo = repositories[repo_id]
|
| 101 |
+
|
| 102 |
+
try:
|
| 103 |
+
# Validate and clone
|
| 104 |
+
validator = GitHubURLValidator()
|
| 105 |
+
repo_info = await validator.validate_repository(url)
|
| 106 |
+
branch = branch or repo_info.branch or "main"
|
| 107 |
+
|
| 108 |
+
loader = RepositoryLoader()
|
| 109 |
+
repo_path = loader.clone_repository(repo_info, branch)
|
| 110 |
+
|
| 111 |
+
repo.clone_path = repo_path
|
| 112 |
+
repo.status = RepositoryStatus.INDEXING
|
| 113 |
+
save_repositories()
|
| 114 |
+
|
| 115 |
+
# Filter files
|
| 116 |
+
file_filter = FileFilter(
|
| 117 |
+
include_patterns=include_patterns,
|
| 118 |
+
exclude_patterns=exclude_patterns,
|
| 119 |
+
)
|
| 120 |
+
files = list(file_filter.filter_files(repo_path))
|
| 121 |
+
|
| 122 |
+
# Load documents
|
| 123 |
+
documents = []
|
| 124 |
+
for file_path in files:
|
| 125 |
+
try:
|
| 126 |
+
doc = Document.from_file(file_path, repo_path, repo.id)
|
| 127 |
+
documents.append(doc)
|
| 128 |
+
except Exception as e:
|
| 129 |
+
logger.warning("Failed to load file", path=str(file_path), error=str(e))
|
| 130 |
+
|
| 131 |
+
# Chunk
|
| 132 |
+
chunker = CodeChunker()
|
| 133 |
+
chunks = []
|
| 134 |
+
for doc in documents:
|
| 135 |
+
for chunk in chunker.chunk_document(doc):
|
| 136 |
+
chunks.append(chunk)
|
| 137 |
+
|
| 138 |
+
# Embed and store
|
| 139 |
+
if chunks:
|
| 140 |
+
vectorstore = VectorStore()
|
| 141 |
+
vectorstore.delete_repo_chunks(repo.id)
|
| 142 |
+
|
| 143 |
+
embedder = EmbeddingGenerator()
|
| 144 |
+
embedded_chunks = embedder.embed_chunks(chunks)
|
| 145 |
+
vectorstore.add_chunks(embedded_chunks)
|
| 146 |
+
|
| 147 |
+
# Update status
|
| 148 |
+
repo.chunk_count = len(chunks)
|
| 149 |
+
repo.indexed_at = datetime.now()
|
| 150 |
+
repo.status = RepositoryStatus.READY
|
| 151 |
+
save_repositories()
|
| 152 |
+
|
| 153 |
+
logger.info("Repository indexed", repo_id=repo_id, chunks=len(chunks))
|
| 154 |
+
|
| 155 |
+
except Exception as e:
|
| 156 |
+
logger.error("Indexing failed", repo_id=repo_id, error=str(e))
|
| 157 |
+
repo.status = RepositoryStatus.ERROR
|
| 158 |
+
repo.error_message = str(e)
|
| 159 |
+
save_repositories()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
@router.post("/repos/index", response_model=IndexRepositoryResponse, status_code=202)
|
| 163 |
+
async def index_repository(
|
| 164 |
+
request: IndexRepositoryRequest,
|
| 165 |
+
background_tasks: BackgroundTasks,
|
| 166 |
+
) -> IndexRepositoryResponse:
|
| 167 |
+
"""Index a GitHub repository."""
|
| 168 |
+
# Create repository record
|
| 169 |
+
repo = Repository(
|
| 170 |
+
url=request.url,
|
| 171 |
+
branch=request.branch or "main",
|
| 172 |
+
status=RepositoryStatus.PENDING,
|
| 173 |
+
)
|
| 174 |
+
repositories[repo.id] = repo
|
| 175 |
+
save_repositories()
|
| 176 |
+
|
| 177 |
+
# Start background indexing
|
| 178 |
+
background_tasks.add_task(
|
| 179 |
+
index_repository_task,
|
| 180 |
+
request.url,
|
| 181 |
+
repo.id,
|
| 182 |
+
request.branch,
|
| 183 |
+
request.include_patterns,
|
| 184 |
+
request.exclude_patterns,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
return IndexRepositoryResponse(
|
| 188 |
+
repo_id=repo.id,
|
| 189 |
+
status=repo.status.value,
|
| 190 |
+
message="Repository indexing started",
|
| 191 |
+
)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
@router.post("/query", response_model=QueryResponse)
|
| 195 |
+
async def query_repository(request: QueryRequest) -> QueryResponse:
|
| 196 |
+
"""Query a repository.
|
| 197 |
+
|
| 198 |
+
Supports both full repository IDs and partial IDs (first 8+ characters).
|
| 199 |
+
"""
|
| 200 |
+
# Check repository exists (supports partial IDs)
|
| 201 |
+
repo = get_repo_or_404(request.repo_id)
|
| 202 |
+
|
| 203 |
+
if repo.status != RepositoryStatus.READY:
|
| 204 |
+
raise HTTPException(
|
| 205 |
+
status_code=400,
|
| 206 |
+
detail=f"Repository not ready (status: {repo.status.value})",
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
try:
|
| 210 |
+
# Generate response (use resolved repo.id for consistency)
|
| 211 |
+
generator = ResponseGenerator()
|
| 212 |
+
query = QueryModel(
|
| 213 |
+
question=request.question,
|
| 214 |
+
repo_id=repo.id, # Use resolved full ID
|
| 215 |
+
top_k=request.top_k,
|
| 216 |
+
)
|
| 217 |
+
response = generator.generate(query)
|
| 218 |
+
|
| 219 |
+
# Convert to API schema
|
| 220 |
+
return QueryResponse(
|
| 221 |
+
answer=response.answer,
|
| 222 |
+
citations=[
|
| 223 |
+
CitationResponse(
|
| 224 |
+
file_path=c.file_path,
|
| 225 |
+
start_line=c.start_line,
|
| 226 |
+
end_line=c.end_line,
|
| 227 |
+
)
|
| 228 |
+
for c in response.citations
|
| 229 |
+
],
|
| 230 |
+
retrieved_chunks=[
|
| 231 |
+
RetrievedChunkResponse(
|
| 232 |
+
chunk_id=c.chunk_id,
|
| 233 |
+
file_path=c.file_path,
|
| 234 |
+
start_line=c.start_line,
|
| 235 |
+
end_line=c.end_line,
|
| 236 |
+
relevance_score=c.relevance_score,
|
| 237 |
+
chunk_type=c.chunk_type,
|
| 238 |
+
name=c.name,
|
| 239 |
+
content=c.content,
|
| 240 |
+
)
|
| 241 |
+
for c in response.retrieved_chunks
|
| 242 |
+
],
|
| 243 |
+
grounded=response.grounded,
|
| 244 |
+
query_id=response.query_id,
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
except Exception as e:
|
| 248 |
+
logger.error("Query failed", error=str(e))
|
| 249 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
@router.get("/repos", response_model=ListRepositoriesResponse)
|
| 253 |
+
async def list_repositories() -> ListRepositoriesResponse:
|
| 254 |
+
"""List all repositories."""
|
| 255 |
+
return ListRepositoriesResponse(
|
| 256 |
+
repositories=[
|
| 257 |
+
RepositoryInfo(
|
| 258 |
+
id=repo.id,
|
| 259 |
+
url=repo.url,
|
| 260 |
+
branch=repo.branch,
|
| 261 |
+
chunk_count=repo.chunk_count,
|
| 262 |
+
status=repo.status.value,
|
| 263 |
+
indexed_at=repo.indexed_at,
|
| 264 |
+
error_message=repo.error_message,
|
| 265 |
+
)
|
| 266 |
+
for repo in repositories.values()
|
| 267 |
+
]
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
@router.get("/repos/{repo_id}", response_model=RepositoryInfo)
|
| 272 |
+
async def get_repository(repo_id: str) -> RepositoryInfo:
|
| 273 |
+
"""Get repository details.
|
| 274 |
+
|
| 275 |
+
Supports both full repository IDs and partial IDs (first 8+ characters).
|
| 276 |
+
"""
|
| 277 |
+
repo = get_repo_or_404(repo_id)
|
| 278 |
+
return RepositoryInfo(
|
| 279 |
+
id=repo.id,
|
| 280 |
+
url=repo.url,
|
| 281 |
+
branch=repo.branch,
|
| 282 |
+
chunk_count=repo.chunk_count,
|
| 283 |
+
status=repo.status.value,
|
| 284 |
+
indexed_at=repo.indexed_at,
|
| 285 |
+
error_message=repo.error_message,
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
@router.delete("/repos/{repo_id}")
|
| 290 |
+
async def delete_repository(repo_id: str) -> dict:
|
| 291 |
+
"""Delete a repository.
|
| 292 |
+
|
| 293 |
+
Supports both full repository IDs and partial IDs (first 8+ characters).
|
| 294 |
+
"""
|
| 295 |
+
repo = get_repo_or_404(repo_id)
|
| 296 |
+
|
| 297 |
+
try:
|
| 298 |
+
# Delete from vector store (use resolved full ID)
|
| 299 |
+
vectorstore = VectorStore()
|
| 300 |
+
vectorstore.delete_repo_chunks(repo.id)
|
| 301 |
+
|
| 302 |
+
# Delete from records (use resolved full ID)
|
| 303 |
+
del repositories[repo.id]
|
| 304 |
+
save_repositories()
|
| 305 |
+
|
| 306 |
+
return {"message": f"Repository {repo.full_name} deleted"}
|
| 307 |
+
|
| 308 |
+
except Exception as e:
|
| 309 |
+
logger.error("Delete failed", error=str(e))
|
| 310 |
+
raise HTTPException(status_code=500, detail=str(e))
|
coderag/api/schemas.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for REST API."""
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class IndexRepositoryRequest(BaseModel):
|
| 10 |
+
"""Request to index a repository."""
|
| 11 |
+
|
| 12 |
+
url: str = Field(..., description="GitHub repository URL")
|
| 13 |
+
branch: Optional[str] = Field(None, description="Branch name (default: main)")
|
| 14 |
+
include_patterns: Optional[list[str]] = Field(None, description="File patterns to include")
|
| 15 |
+
exclude_patterns: Optional[list[str]] = Field(None, description="File patterns to exclude")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class IndexRepositoryResponse(BaseModel):
|
| 19 |
+
"""Response from indexing request."""
|
| 20 |
+
|
| 21 |
+
repo_id: str = Field(..., description="Repository ID")
|
| 22 |
+
status: str = Field(..., description="Indexing status")
|
| 23 |
+
message: str = Field(..., description="Status message")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class QueryRequest(BaseModel):
|
| 27 |
+
"""Request to query a repository."""
|
| 28 |
+
|
| 29 |
+
question: str = Field(..., description="Question about the repository")
|
| 30 |
+
repo_id: str = Field(..., description="Repository ID to query")
|
| 31 |
+
top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class CitationResponse(BaseModel):
|
| 35 |
+
"""Citation information."""
|
| 36 |
+
|
| 37 |
+
file_path: str
|
| 38 |
+
start_line: int
|
| 39 |
+
end_line: int
|
| 40 |
+
|
| 41 |
+
class Config:
|
| 42 |
+
from_attributes = True
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class RetrievedChunkResponse(BaseModel):
|
| 46 |
+
"""Retrieved chunk information."""
|
| 47 |
+
|
| 48 |
+
chunk_id: str
|
| 49 |
+
file_path: str
|
| 50 |
+
start_line: int
|
| 51 |
+
end_line: int
|
| 52 |
+
relevance_score: float
|
| 53 |
+
chunk_type: str
|
| 54 |
+
name: Optional[str] = None
|
| 55 |
+
content: str
|
| 56 |
+
|
| 57 |
+
class Config:
|
| 58 |
+
from_attributes = True
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class QueryResponse(BaseModel):
|
| 62 |
+
"""Response from a query."""
|
| 63 |
+
|
| 64 |
+
answer: str = Field(..., description="Generated answer")
|
| 65 |
+
citations: list[CitationResponse] = Field(..., description="Citations in the answer")
|
| 66 |
+
retrieved_chunks: list[RetrievedChunkResponse] = Field(..., description="Evidence chunks")
|
| 67 |
+
grounded: bool = Field(..., description="Whether response is grounded in evidence")
|
| 68 |
+
query_id: str = Field(..., description="Query ID")
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class RepositoryInfo(BaseModel):
|
| 72 |
+
"""Repository information."""
|
| 73 |
+
|
| 74 |
+
id: str
|
| 75 |
+
url: str
|
| 76 |
+
branch: str
|
| 77 |
+
chunk_count: int
|
| 78 |
+
status: str
|
| 79 |
+
indexed_at: Optional[datetime] = None
|
| 80 |
+
error_message: Optional[str] = None
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class ListRepositoriesResponse(BaseModel):
|
| 84 |
+
"""List of repositories."""
|
| 85 |
+
|
| 86 |
+
repositories: list[RepositoryInfo]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class HealthResponse(BaseModel):
|
| 90 |
+
"""Health check response."""
|
| 91 |
+
|
| 92 |
+
status: str
|
| 93 |
+
app: str
|
| 94 |
+
version: str
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class ErrorResponse(BaseModel):
|
| 98 |
+
"""Error response."""
|
| 99 |
+
|
| 100 |
+
error: str
|
| 101 |
+
detail: Optional[str] = None
|
coderag/cli.py
ADDED
|
@@ -0,0 +1,675 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unified CLI for CodeRAG."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import platform
|
| 6 |
+
import shutil
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
import click
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Config directory and file
|
| 15 |
+
CONFIG_DIR = Path.home() / ".config" / "coderag"
|
| 16 |
+
CONFIG_FILE = CONFIG_DIR / "config.json"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def get_config() -> dict:
|
| 20 |
+
"""Load configuration from config file."""
|
| 21 |
+
if CONFIG_FILE.exists():
|
| 22 |
+
try:
|
| 23 |
+
return json.loads(CONFIG_FILE.read_text())
|
| 24 |
+
except Exception:
|
| 25 |
+
return {}
|
| 26 |
+
return {}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def save_config(config: dict) -> None:
|
| 30 |
+
"""Save configuration to config file."""
|
| 31 |
+
CONFIG_DIR.mkdir(parents=True, exist_ok=True)
|
| 32 |
+
CONFIG_FILE.write_text(json.dumps(config, indent=2))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def get_claude_config_path() -> Optional[Path]:
|
| 36 |
+
"""Get Claude Desktop config path based on OS."""
|
| 37 |
+
system = platform.system()
|
| 38 |
+
|
| 39 |
+
if system == "Darwin": # macOS
|
| 40 |
+
return Path.home() / "Library" / "Application Support" / "Claude" / "claude_desktop_config.json"
|
| 41 |
+
elif system == "Linux":
|
| 42 |
+
return Path.home() / ".config" / "Claude" / "claude_desktop_config.json"
|
| 43 |
+
elif system == "Windows":
|
| 44 |
+
appdata = os.environ.get("APPDATA", "")
|
| 45 |
+
if appdata:
|
| 46 |
+
return Path(appdata) / "Claude" / "claude_desktop_config.json"
|
| 47 |
+
return None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@click.group()
|
| 51 |
+
@click.version_option(package_name="coderag")
|
| 52 |
+
def cli():
|
| 53 |
+
"""CodeRAG - RAG-based Q&A system for code repositories.
|
| 54 |
+
|
| 55 |
+
Use 'coderag setup' to configure, then 'coderag serve' to start.
|
| 56 |
+
For Claude Desktop integration, run 'coderag mcp-install'.
|
| 57 |
+
"""
|
| 58 |
+
pass
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@cli.command()
|
| 62 |
+
@click.option("--provider", type=click.Choice(["groq", "openai", "anthropic", "openrouter", "together", "local"]),
|
| 63 |
+
default=None, help="LLM provider to use")
|
| 64 |
+
@click.option("--api-key", default=None, help="API key for the provider")
|
| 65 |
+
def setup(provider: Optional[str], api_key: Optional[str]):
|
| 66 |
+
"""Interactive setup wizard for CodeRAG.
|
| 67 |
+
|
| 68 |
+
Configures the LLM provider and API key. Configuration is saved to
|
| 69 |
+
~/.config/coderag/config.json and can be overridden by environment variables.
|
| 70 |
+
"""
|
| 71 |
+
config = get_config()
|
| 72 |
+
|
| 73 |
+
click.echo("\n🔧 CodeRAG Setup\n")
|
| 74 |
+
|
| 75 |
+
# Provider selection
|
| 76 |
+
if provider is None:
|
| 77 |
+
click.echo("Select your LLM provider:")
|
| 78 |
+
click.echo(" 1. groq (FREE, fast - recommended)")
|
| 79 |
+
click.echo(" 2. openai")
|
| 80 |
+
click.echo(" 3. anthropic")
|
| 81 |
+
click.echo(" 4. openrouter")
|
| 82 |
+
click.echo(" 5. together")
|
| 83 |
+
click.echo(" 6. local (requires GPU)")
|
| 84 |
+
|
| 85 |
+
choice = click.prompt("Enter choice", type=int, default=1)
|
| 86 |
+
providers = {1: "groq", 2: "openai", 3: "anthropic", 4: "openrouter", 5: "together", 6: "local"}
|
| 87 |
+
provider = providers.get(choice, "groq")
|
| 88 |
+
|
| 89 |
+
config["llm_provider"] = provider
|
| 90 |
+
|
| 91 |
+
# API key (not needed for local)
|
| 92 |
+
if provider != "local":
|
| 93 |
+
if api_key is None:
|
| 94 |
+
api_key_urls = {
|
| 95 |
+
"groq": "https://console.groq.com/keys",
|
| 96 |
+
"openai": "https://platform.openai.com/api-keys",
|
| 97 |
+
"anthropic": "https://console.anthropic.com/settings/keys",
|
| 98 |
+
"openrouter": "https://openrouter.ai/keys",
|
| 99 |
+
"together": "https://api.together.xyz/settings/api-keys",
|
| 100 |
+
}
|
| 101 |
+
url = api_key_urls.get(provider, "")
|
| 102 |
+
if url:
|
| 103 |
+
click.echo(f"\nGet your API key from: {url}")
|
| 104 |
+
|
| 105 |
+
api_key = click.prompt("Enter your API key", hide_input=True)
|
| 106 |
+
|
| 107 |
+
config["llm_api_key"] = api_key
|
| 108 |
+
|
| 109 |
+
# Validate API key
|
| 110 |
+
click.echo("\n⏳ Validating API key...")
|
| 111 |
+
if _validate_api_key(provider, api_key):
|
| 112 |
+
click.echo("✅ API key is valid!")
|
| 113 |
+
else:
|
| 114 |
+
click.echo("⚠️ Could not validate API key. It may still work.")
|
| 115 |
+
else:
|
| 116 |
+
click.echo("\n⚠️ Local mode requires a CUDA-capable GPU.")
|
| 117 |
+
|
| 118 |
+
# Save config
|
| 119 |
+
save_config(config)
|
| 120 |
+
click.echo(f"\n✅ Configuration saved to {CONFIG_FILE}")
|
| 121 |
+
|
| 122 |
+
# Next steps
|
| 123 |
+
click.echo("\n📋 Next steps:")
|
| 124 |
+
click.echo(" 1. Run 'coderag serve' to start the web interface")
|
| 125 |
+
click.echo(" 2. Run 'coderag mcp-install' to integrate with Claude Desktop")
|
| 126 |
+
click.echo(" 3. Run 'coderag index <url>' to index a repository")
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _validate_api_key(provider: str, api_key: str) -> bool:
|
| 130 |
+
"""Validate API key by making a test request."""
|
| 131 |
+
try:
|
| 132 |
+
from openai import OpenAI
|
| 133 |
+
|
| 134 |
+
base_urls = {
|
| 135 |
+
"groq": "https://api.groq.com/openai/v1",
|
| 136 |
+
"openai": "https://api.openai.com/v1",
|
| 137 |
+
"openrouter": "https://openrouter.ai/api/v1",
|
| 138 |
+
"together": "https://api.together.xyz/v1",
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
if provider not in base_urls:
|
| 142 |
+
return True # Can't validate, assume OK
|
| 143 |
+
|
| 144 |
+
client = OpenAI(api_key=api_key, base_url=base_urls[provider])
|
| 145 |
+
client.models.list()
|
| 146 |
+
return True
|
| 147 |
+
except Exception:
|
| 148 |
+
return False
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@cli.command()
@click.option("--host", default="0.0.0.0", help="Host to bind to")
@click.option("--port", default=8000, type=int, help="Port to bind to")
@click.option("--reload", is_flag=True, help="Enable auto-reload for development")
def serve(host: str, port: int, reload: bool):
    """Start the CodeRAG web server.

    Starts the FastAPI server with Gradio UI, REST API, and MCP endpoint.
    """
    # Apply config from file to environment
    _apply_config_to_env()

    import uvicorn
    from coderag.config import get_settings

    settings = get_settings()

    click.echo(f"\n🚀 Starting CodeRAG server at http://{host}:{port}")
    click.echo(" Press Ctrl+C to stop\n")

    if reload:
        # uvicorn requires an *import string* (plus factory=True for an app
        # factory) to support --reload; passing an app object silently
        # disables reloading with only a warning.
        uvicorn.run(
            "coderag.main:create_app",
            factory=True,
            host=host,
            port=port,
            reload=True,
            log_level=settings.server.log_level,
        )
    else:
        from coderag.main import create_app

        uvicorn.run(
            create_app(),
            host=host,
            port=port,
            log_level=settings.server.log_level,
        )
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
@cli.command("mcp-run")
|
| 183 |
+
def mcp_run():
|
| 184 |
+
"""Run MCP server in stdio mode (for Claude Desktop).
|
| 185 |
+
|
| 186 |
+
This command is used by Claude Desktop to communicate with CodeRAG.
|
| 187 |
+
You typically don't need to run this manually.
|
| 188 |
+
"""
|
| 189 |
+
# Apply config from file to environment
|
| 190 |
+
_apply_config_to_env()
|
| 191 |
+
|
| 192 |
+
# Suppress all output except MCP protocol
|
| 193 |
+
import logging
|
| 194 |
+
logging.basicConfig(level=logging.WARNING, stream=sys.stderr)
|
| 195 |
+
|
| 196 |
+
import structlog
|
| 197 |
+
structlog.configure(
|
| 198 |
+
wrapper_class=structlog.make_filtering_bound_logger(logging.CRITICAL),
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
from coderag.mcp.server import create_mcp_server
|
| 202 |
+
mcp = create_mcp_server()
|
| 203 |
+
mcp.run(transport="stdio")
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
@cli.command("mcp-install")
|
| 207 |
+
@click.option("--dry-run", is_flag=True, help="Preview changes without applying")
|
| 208 |
+
def mcp_install(dry_run: bool):
|
| 209 |
+
"""Configure Claude Desktop to use CodeRAG MCP.
|
| 210 |
+
|
| 211 |
+
Automatically detects your OS and updates the Claude Desktop configuration
|
| 212 |
+
to include the CodeRAG MCP server.
|
| 213 |
+
"""
|
| 214 |
+
config_path = get_claude_config_path()
|
| 215 |
+
|
| 216 |
+
if config_path is None:
|
| 217 |
+
click.echo("❌ Could not determine Claude Desktop config location.")
|
| 218 |
+
click.echo(" Please manually add the MCP configuration.")
|
| 219 |
+
sys.exit(1)
|
| 220 |
+
|
| 221 |
+
click.echo(f"\n🔍 Claude Desktop config: {config_path}")
|
| 222 |
+
|
| 223 |
+
# Check if Claude Desktop is installed
|
| 224 |
+
if not config_path.parent.exists():
|
| 225 |
+
click.echo("\n❌ Claude Desktop does not appear to be installed.")
|
| 226 |
+
click.echo(" Install it from: https://claude.ai/download")
|
| 227 |
+
sys.exit(1)
|
| 228 |
+
|
| 229 |
+
# Load existing config or create new
|
| 230 |
+
if config_path.exists():
|
| 231 |
+
try:
|
| 232 |
+
config = json.loads(config_path.read_text())
|
| 233 |
+
except json.JSONDecodeError:
|
| 234 |
+
click.echo("⚠️ Existing config is invalid JSON. Creating new config.")
|
| 235 |
+
config = {}
|
| 236 |
+
else:
|
| 237 |
+
config = {}
|
| 238 |
+
|
| 239 |
+
# Ensure mcpServers key exists
|
| 240 |
+
if "mcpServers" not in config:
|
| 241 |
+
config["mcpServers"] = {}
|
| 242 |
+
|
| 243 |
+
# Find the coderag-mcp command path
|
| 244 |
+
coderag_path = shutil.which("coderag")
|
| 245 |
+
if coderag_path is None:
|
| 246 |
+
# Fallback to python -m
|
| 247 |
+
python_path = sys.executable
|
| 248 |
+
mcp_command = [python_path, "-m", "coderag.mcp.cli"]
|
| 249 |
+
else:
|
| 250 |
+
mcp_command = [coderag_path, "mcp-run"]
|
| 251 |
+
|
| 252 |
+
# Prepare MCP server config
|
| 253 |
+
new_mcp_config = {
|
| 254 |
+
"command": mcp_command[0],
|
| 255 |
+
"args": mcp_command[1:] if len(mcp_command) > 1 else [],
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
# Check if already configured
|
| 259 |
+
existing = config["mcpServers"].get("coderag")
|
| 260 |
+
if existing == new_mcp_config:
|
| 261 |
+
click.echo("\n✅ CodeRAG MCP is already configured correctly!")
|
| 262 |
+
return
|
| 263 |
+
|
| 264 |
+
# Show diff
|
| 265 |
+
click.echo("\n📝 Changes to be made:")
|
| 266 |
+
if existing:
|
| 267 |
+
click.echo(f" Update: mcpServers.coderag")
|
| 268 |
+
click.echo(f" From: {json.dumps(existing)}")
|
| 269 |
+
click.echo(f" To: {json.dumps(new_mcp_config)}")
|
| 270 |
+
else:
|
| 271 |
+
click.echo(f" Add: mcpServers.coderag = {json.dumps(new_mcp_config)}")
|
| 272 |
+
|
| 273 |
+
if dry_run:
|
| 274 |
+
click.echo("\n🔍 Dry run - no changes made.")
|
| 275 |
+
return
|
| 276 |
+
|
| 277 |
+
# Backup existing config
|
| 278 |
+
if config_path.exists():
|
| 279 |
+
backup_path = config_path.with_suffix(".json.backup")
|
| 280 |
+
shutil.copy(config_path, backup_path)
|
| 281 |
+
click.echo(f"\n📦 Backup saved to: {backup_path}")
|
| 282 |
+
|
| 283 |
+
# Apply changes
|
| 284 |
+
config["mcpServers"]["coderag"] = new_mcp_config
|
| 285 |
+
config_path.parent.mkdir(parents=True, exist_ok=True)
|
| 286 |
+
config_path.write_text(json.dumps(config, indent=2))
|
| 287 |
+
|
| 288 |
+
click.echo("\n✅ Claude Desktop configuration updated!")
|
| 289 |
+
click.echo("\n⚠️ Please restart Claude Desktop to apply changes.")
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
@cli.command("index")
|
| 293 |
+
@click.argument("url")
|
| 294 |
+
@click.option("--branch", default="", help="Branch to index (default: main/master)")
|
| 295 |
+
def index(url: str, branch: str):
|
| 296 |
+
"""Index a GitHub repository.
|
| 297 |
+
|
| 298 |
+
URL: The GitHub repository URL to index.
|
| 299 |
+
|
| 300 |
+
Example: coderag index https://github.com/owner/repo
|
| 301 |
+
"""
|
| 302 |
+
# Apply config from file to environment
|
| 303 |
+
_apply_config_to_env()
|
| 304 |
+
|
| 305 |
+
import asyncio
|
| 306 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 307 |
+
|
| 308 |
+
click.echo(f"\n📦 Indexing repository: {url}")
|
| 309 |
+
if branch:
|
| 310 |
+
click.echo(f" Branch: {branch}")
|
| 311 |
+
|
| 312 |
+
handlers = get_mcp_handlers()
|
| 313 |
+
|
| 314 |
+
async def run_index():
|
| 315 |
+
result = await handlers.index_repository(url=url, branch=branch)
|
| 316 |
+
return result
|
| 317 |
+
|
| 318 |
+
result = asyncio.run(run_index())
|
| 319 |
+
|
| 320 |
+
if result.get("success"):
|
| 321 |
+
click.echo(f"\n✅ Repository indexed successfully!")
|
| 322 |
+
click.echo(f" Repo ID: {result['repo_id']}")
|
| 323 |
+
click.echo(f" Name: {result['name']}")
|
| 324 |
+
click.echo(f" Files processed: {result['files_processed']}")
|
| 325 |
+
click.echo(f" Chunks indexed: {result['chunks_indexed']}")
|
| 326 |
+
click.echo(f"\n Use 'coderag query {result['repo_id'][:8]} \"your question\"' to query")
|
| 327 |
+
else:
|
| 328 |
+
click.echo(f"\n❌ Indexing failed: {result.get('error', 'Unknown error')}")
|
| 329 |
+
sys.exit(1)
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
@cli.command("query")
|
| 333 |
+
@click.argument("repo_id")
|
| 334 |
+
@click.argument("question")
|
| 335 |
+
@click.option("--top-k", default=5, type=int, help="Number of chunks to retrieve")
|
| 336 |
+
@click.option("--format", "output_format", type=click.Choice(["text", "json"]), default="text", help="Output format")
|
| 337 |
+
def query(repo_id: str, question: str, top_k: int, output_format: str):
|
| 338 |
+
"""Ask a question about an indexed repository.
|
| 339 |
+
|
| 340 |
+
REPO_ID: Repository ID (full or first 8 characters)
|
| 341 |
+
QUESTION: Your question about the code
|
| 342 |
+
|
| 343 |
+
Example: coderag query abc12345 "How does authentication work?"
|
| 344 |
+
"""
|
| 345 |
+
# Apply config from file to environment
|
| 346 |
+
_apply_config_to_env()
|
| 347 |
+
|
| 348 |
+
import asyncio
|
| 349 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 350 |
+
|
| 351 |
+
handlers = get_mcp_handlers()
|
| 352 |
+
|
| 353 |
+
async def run_query():
|
| 354 |
+
result = await handlers.query_code(repo_id=repo_id, question=question, top_k=top_k)
|
| 355 |
+
return result
|
| 356 |
+
|
| 357 |
+
click.echo(f"\n🔍 Querying: {question}\n")
|
| 358 |
+
result = asyncio.run(run_query())
|
| 359 |
+
|
| 360 |
+
if result.get("error"):
|
| 361 |
+
click.echo(f"❌ Error: {result['error']}")
|
| 362 |
+
sys.exit(1)
|
| 363 |
+
|
| 364 |
+
if output_format == "json":
|
| 365 |
+
click.echo(json.dumps(result, indent=2))
|
| 366 |
+
else:
|
| 367 |
+
click.echo("📝 Answer:\n")
|
| 368 |
+
click.echo(result.get("answer", "No answer generated."))
|
| 369 |
+
|
| 370 |
+
if result.get("citations"):
|
| 371 |
+
click.echo("\n📍 Citations:")
|
| 372 |
+
for citation in result["citations"]:
|
| 373 |
+
click.echo(f" {citation}")
|
| 374 |
+
|
| 375 |
+
if result.get("evidence"):
|
| 376 |
+
click.echo("\n📂 Evidence:")
|
| 377 |
+
for chunk in result["evidence"][:3]: # Show top 3
|
| 378 |
+
click.echo(f" - {chunk['file']}:{chunk['start_line']}-{chunk['end_line']} (relevance: {chunk['relevance']})")
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
@cli.command("repos")
|
| 382 |
+
@click.option("--format", "output_format", type=click.Choice(["text", "json"]), default="text", help="Output format")
|
| 383 |
+
def repos(output_format: str):
|
| 384 |
+
"""List all indexed repositories."""
|
| 385 |
+
# Apply config from file to environment
|
| 386 |
+
_apply_config_to_env()
|
| 387 |
+
|
| 388 |
+
import asyncio
|
| 389 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 390 |
+
|
| 391 |
+
handlers = get_mcp_handlers()
|
| 392 |
+
|
| 393 |
+
async def run_list():
|
| 394 |
+
result = await handlers.list_repositories()
|
| 395 |
+
return result
|
| 396 |
+
|
| 397 |
+
result = asyncio.run(run_list())
|
| 398 |
+
|
| 399 |
+
if output_format == "json":
|
| 400 |
+
click.echo(json.dumps(result, indent=2))
|
| 401 |
+
else:
|
| 402 |
+
repos_list = result.get("repositories", [])
|
| 403 |
+
if not repos_list:
|
| 404 |
+
click.echo("\n📭 No repositories indexed yet.")
|
| 405 |
+
click.echo(" Run 'coderag index <url>' to index a repository.")
|
| 406 |
+
return
|
| 407 |
+
|
| 408 |
+
click.echo(f"\n📚 Indexed Repositories ({len(repos_list)}):\n")
|
| 409 |
+
for repo in repos_list:
|
| 410 |
+
status_icon = "✅" if repo["status"] == "ready" else "⏳" if repo["status"] == "indexing" else "❌"
|
| 411 |
+
click.echo(f" {status_icon} {repo['id'][:8]} {repo['name']} ({repo['branch']})")
|
| 412 |
+
click.echo(f" Chunks: {repo['chunk_count']} | Indexed: {repo.get('indexed_at', 'N/A')}")
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
@cli.command("update")
|
| 416 |
+
@click.argument("repo_id")
|
| 417 |
+
def update(repo_id: str):
|
| 418 |
+
"""Update an indexed repository with latest changes.
|
| 419 |
+
|
| 420 |
+
REPO_ID: Repository ID (full or first 8 characters)
|
| 421 |
+
|
| 422 |
+
Fetches the latest changes from GitHub and re-indexes only the modified files.
|
| 423 |
+
This is faster than a full re-index for repositories with frequent updates.
|
| 424 |
+
|
| 425 |
+
Example: coderag update abc12345
|
| 426 |
+
"""
|
| 427 |
+
# Apply config from file to environment
|
| 428 |
+
_apply_config_to_env()
|
| 429 |
+
|
| 430 |
+
import asyncio
|
| 431 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 432 |
+
|
| 433 |
+
click.echo(f"\n🔄 Updating repository: {repo_id}\n")
|
| 434 |
+
|
| 435 |
+
handlers = get_mcp_handlers()
|
| 436 |
+
|
| 437 |
+
async def run_update():
|
| 438 |
+
result = await handlers.update_repository(repo_id=repo_id)
|
| 439 |
+
return result
|
| 440 |
+
|
| 441 |
+
result = asyncio.run(run_update())
|
| 442 |
+
|
| 443 |
+
if result.get("error"):
|
| 444 |
+
click.echo(f"❌ Error: {result['error']}")
|
| 445 |
+
sys.exit(1)
|
| 446 |
+
|
| 447 |
+
if result.get("message") == "Repository is already up to date":
|
| 448 |
+
click.echo("✅ Repository is already up to date!")
|
| 449 |
+
else:
|
| 450 |
+
click.echo("✅ Repository updated successfully!")
|
| 451 |
+
click.echo(f" Files changed: {result.get('files_changed', 0)}")
|
| 452 |
+
click.echo(f" - Added: {result.get('files_added', 0)}")
|
| 453 |
+
click.echo(f" - Modified: {result.get('files_modified', 0)}")
|
| 454 |
+
click.echo(f" - Deleted: {result.get('files_deleted', 0)}")
|
| 455 |
+
click.echo(f" Chunks added: {result.get('chunks_added', 0)}")
|
| 456 |
+
click.echo(f" Chunks deleted: {result.get('chunks_deleted', 0)}")
|
| 457 |
+
click.echo(f" Total chunks: {result.get('total_chunks', 0)}")
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
@cli.command("delete")
|
| 461 |
+
@click.argument("repo_id")
|
| 462 |
+
@click.option("--force", "-f", is_flag=True, help="Skip confirmation prompt")
|
| 463 |
+
def delete(repo_id: str, force: bool):
|
| 464 |
+
"""Delete an indexed repository.
|
| 465 |
+
|
| 466 |
+
REPO_ID: Repository ID (full or first 8 characters)
|
| 467 |
+
|
| 468 |
+
Removes the repository from the index and deletes all associated chunks
|
| 469 |
+
from the vector store.
|
| 470 |
+
|
| 471 |
+
Example: coderag delete abc12345
|
| 472 |
+
"""
|
| 473 |
+
# Apply config from file to environment
|
| 474 |
+
_apply_config_to_env()
|
| 475 |
+
|
| 476 |
+
import asyncio
|
| 477 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 478 |
+
|
| 479 |
+
handlers = get_mcp_handlers()
|
| 480 |
+
|
| 481 |
+
# First get repo info for confirmation
|
| 482 |
+
async def get_repo_info():
|
| 483 |
+
result = await handlers.get_repository_info(repo_id=repo_id)
|
| 484 |
+
return result
|
| 485 |
+
|
| 486 |
+
info = asyncio.run(get_repo_info())
|
| 487 |
+
|
| 488 |
+
if info.get("error"):
|
| 489 |
+
click.echo(f"❌ Error: {info['error']}")
|
| 490 |
+
sys.exit(1)
|
| 491 |
+
|
| 492 |
+
repo_name = info.get("name", repo_id)
|
| 493 |
+
chunk_count = info.get("chunk_count", 0)
|
| 494 |
+
|
| 495 |
+
if not force:
|
| 496 |
+
click.echo(f"\n⚠️ About to delete: {repo_name}")
|
| 497 |
+
click.echo(f" Chunks to delete: {chunk_count}")
|
| 498 |
+
if not click.confirm("\nAre you sure?"):
|
| 499 |
+
click.echo("Cancelled.")
|
| 500 |
+
return
|
| 501 |
+
|
| 502 |
+
async def run_delete():
|
| 503 |
+
result = await handlers.delete_repository(repo_id=repo_id)
|
| 504 |
+
return result
|
| 505 |
+
|
| 506 |
+
result = asyncio.run(run_delete())
|
| 507 |
+
|
| 508 |
+
if result.get("error"):
|
| 509 |
+
click.echo(f"❌ Error: {result['error']}")
|
| 510 |
+
sys.exit(1)
|
| 511 |
+
|
| 512 |
+
click.echo(f"\n✅ Repository deleted: {result.get('name', repo_id)}")
|
| 513 |
+
click.echo(f" Chunks removed: {result.get('chunks_deleted', 0)}")
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
@cli.command("clean")
|
| 517 |
+
@click.option("--force", "-f", is_flag=True, help="Skip confirmation prompt")
|
| 518 |
+
def clean(force: bool):
|
| 519 |
+
"""Clean up repositories with errors or stuck in indexing.
|
| 520 |
+
|
| 521 |
+
Removes all repositories that have status 'error' or have been stuck
|
| 522 |
+
in 'indexing' or 'pending' status for too long.
|
| 523 |
+
|
| 524 |
+
Example: coderag clean
|
| 525 |
+
"""
|
| 526 |
+
# Apply config from file to environment
|
| 527 |
+
_apply_config_to_env()
|
| 528 |
+
|
| 529 |
+
import asyncio
|
| 530 |
+
from coderag.mcp.handlers import get_mcp_handlers
|
| 531 |
+
|
| 532 |
+
handlers = get_mcp_handlers()
|
| 533 |
+
|
| 534 |
+
async def get_repos():
|
| 535 |
+
result = await handlers.list_repositories()
|
| 536 |
+
return result
|
| 537 |
+
|
| 538 |
+
result = asyncio.run(get_repos())
|
| 539 |
+
repos = result.get("repositories", [])
|
| 540 |
+
|
| 541 |
+
# Find repos to clean
|
| 542 |
+
to_clean = [r for r in repos if r["status"] in ("error", "indexing", "pending")]
|
| 543 |
+
|
| 544 |
+
if not to_clean:
|
| 545 |
+
click.echo("\n✅ No repositories need cleaning.")
|
| 546 |
+
return
|
| 547 |
+
|
| 548 |
+
click.echo(f"\n🧹 Found {len(to_clean)} repository(ies) to clean:\n")
|
| 549 |
+
for repo in to_clean:
|
| 550 |
+
status_icon = "❌" if repo["status"] == "error" else "⏳"
|
| 551 |
+
click.echo(f" {status_icon} {repo['id'][:8]} {repo['name']} ({repo['status']})")
|
| 552 |
+
|
| 553 |
+
if not force:
|
| 554 |
+
if not click.confirm(f"\nDelete these {len(to_clean)} repositories?"):
|
| 555 |
+
click.echo("Cancelled.")
|
| 556 |
+
return
|
| 557 |
+
|
| 558 |
+
# Delete each repo
|
| 559 |
+
deleted = 0
|
| 560 |
+
for repo in to_clean:
|
| 561 |
+
async def run_delete():
|
| 562 |
+
return await handlers.delete_repository(repo_id=repo["id"])
|
| 563 |
+
|
| 564 |
+
try:
|
| 565 |
+
result = asyncio.run(run_delete())
|
| 566 |
+
if result.get("success"):
|
| 567 |
+
deleted += 1
|
| 568 |
+
click.echo(f" ✅ Deleted: {repo['name']}")
|
| 569 |
+
else:
|
| 570 |
+
click.echo(f" ❌ Failed: {repo['name']} - {result.get('error', 'Unknown')}")
|
| 571 |
+
except Exception as e:
|
| 572 |
+
click.echo(f" ❌ Failed: {repo['name']} - {str(e)}")
|
| 573 |
+
|
| 574 |
+
click.echo(f"\n✅ Cleaned {deleted}/{len(to_clean)} repositories.")
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
@cli.command("doctor")
|
| 578 |
+
def doctor():
|
| 579 |
+
"""Diagnose common issues with CodeRAG setup.
|
| 580 |
+
|
| 581 |
+
Checks Python version, configuration, API key validity, and system components.
|
| 582 |
+
"""
|
| 583 |
+
click.echo("\n🏥 CodeRAG Doctor\n")
|
| 584 |
+
all_ok = True
|
| 585 |
+
|
| 586 |
+
# Check Python version
|
| 587 |
+
py_version = sys.version_info
|
| 588 |
+
if py_version >= (3, 11):
|
| 589 |
+
click.echo(f"✅ Python version: {py_version.major}.{py_version.minor}.{py_version.micro}")
|
| 590 |
+
else:
|
| 591 |
+
click.echo(f"❌ Python version: {py_version.major}.{py_version.minor}.{py_version.micro} (need 3.11+)")
|
| 592 |
+
all_ok = False
|
| 593 |
+
|
| 594 |
+
# Check config file
|
| 595 |
+
config = get_config()
|
| 596 |
+
if config:
|
| 597 |
+
click.echo(f"✅ Config file exists: {CONFIG_FILE}")
|
| 598 |
+
if config.get("llm_provider"):
|
| 599 |
+
click.echo(f" Provider: {config['llm_provider']}")
|
| 600 |
+
else:
|
| 601 |
+
click.echo(f"⚠️ No config file. Run 'coderag setup' to configure.")
|
| 602 |
+
|
| 603 |
+
# Check API key
|
| 604 |
+
api_key = config.get("llm_api_key") or os.environ.get("MODEL_LLM_API_KEY")
|
| 605 |
+
provider = config.get("llm_provider") or os.environ.get("MODEL_LLM_PROVIDER", "groq")
|
| 606 |
+
|
| 607 |
+
if provider != "local":
|
| 608 |
+
if api_key:
|
| 609 |
+
click.echo(f"✅ API key configured (provider: {provider})")
|
| 610 |
+
else:
|
| 611 |
+
click.echo(f"❌ No API key configured for {provider}")
|
| 612 |
+
all_ok = False
|
| 613 |
+
|
| 614 |
+
# Check CUDA
|
| 615 |
+
try:
|
| 616 |
+
import torch
|
| 617 |
+
if torch.cuda.is_available():
|
| 618 |
+
click.echo(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
|
| 619 |
+
else:
|
| 620 |
+
click.echo("ℹ️ CUDA not available (CPU mode for embeddings)")
|
| 621 |
+
except ImportError:
|
| 622 |
+
click.echo("⚠️ PyTorch not installed")
|
| 623 |
+
all_ok = False
|
| 624 |
+
|
| 625 |
+
# Check ChromaDB data directory
|
| 626 |
+
from coderag.config import get_settings
|
| 627 |
+
settings = get_settings()
|
| 628 |
+
chroma_path = settings.vectorstore.persist_directory
|
| 629 |
+
if chroma_path.exists():
|
| 630 |
+
click.echo(f"✅ ChromaDB directory: {chroma_path}")
|
| 631 |
+
else:
|
| 632 |
+
click.echo(f"ℹ️ ChromaDB directory will be created: {chroma_path}")
|
| 633 |
+
|
| 634 |
+
# Check Claude Desktop
|
| 635 |
+
claude_config = get_claude_config_path()
|
| 636 |
+
if claude_config and claude_config.exists():
|
| 637 |
+
try:
|
| 638 |
+
config_data = json.loads(claude_config.read_text())
|
| 639 |
+
if "coderag" in config_data.get("mcpServers", {}):
|
| 640 |
+
click.echo("✅ Claude Desktop MCP configured")
|
| 641 |
+
else:
|
| 642 |
+
click.echo("ℹ️ Claude Desktop installed but MCP not configured. Run 'coderag mcp-install'")
|
| 643 |
+
except Exception:
|
| 644 |
+
click.echo("⚠️ Claude Desktop config exists but could not be read")
|
| 645 |
+
else:
|
| 646 |
+
click.echo("ℹ️ Claude Desktop not detected")
|
| 647 |
+
|
| 648 |
+
# Summary
|
| 649 |
+
if all_ok:
|
| 650 |
+
click.echo("\n✅ All checks passed!")
|
| 651 |
+
else:
|
| 652 |
+
click.echo("\n⚠️ Some issues detected. See above for details.")
|
| 653 |
+
|
| 654 |
+
|
| 655 |
+
def _apply_config_to_env():
    """Apply configuration from config file to environment variables."""
    config = get_config()

    # Config-file values seed the MODEL_* env vars, but an env var that is
    # already set (and non-empty) always wins.
    mappings = (
        ("llm_provider", "MODEL_LLM_PROVIDER"),
        ("llm_api_key", "MODEL_LLM_API_KEY"),
        ("embedding_device", "MODEL_EMBEDDING_DEVICE"),
    )
    for config_key, env_var in mappings:
        value = config.get(config_key)
        if value and not os.environ.get(env_var):
            os.environ[env_var] = value
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
def main():
    """Console-script entry point: dispatch to the click command group."""
    cli()
|
| 672 |
+
|
| 673 |
+
|
| 674 |
+
if __name__ == "__main__":
|
| 675 |
+
main()
|
coderag/config.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application configuration using pydantic-settings."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from pydantic import Field
|
| 7 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class ModelSettings(BaseSettings):
    """LLM and embedding model configuration.

    Every field can be overridden via a ``MODEL_``-prefixed environment
    variable (e.g. ``MODEL_LLM_PROVIDER``, ``MODEL_EMBEDDING_DEVICE``).
    """

    model_config = SettingsConfigDict(env_prefix="MODEL_")

    # LLM Provider: "local", "openai", "groq", "anthropic", "openrouter"
    # Default to "groq" (free tier available, no GPU required)
    llm_provider: str = "groq"

    # API settings (for remote providers)
    llm_api_key: Optional[str] = None
    llm_api_base: Optional[str] = None  # Custom API base URL

    # Model name (local or remote)
    llm_name: str = "Qwen/Qwen2.5-Coder-3B-Instruct"
    # Generation/sampling parameters passed through to the LLM
    llm_max_new_tokens: int = 1024
    llm_temperature: float = 0.1
    llm_top_p: float = 0.95

    # Local model settings (only used when llm_provider == "local")
    llm_use_4bit: bool = True
    llm_device_map: str = "auto"

    # Embedding model settings
    embedding_name: str = "nomic-ai/nomic-embed-text-v1.5"
    embedding_dimension: int = 768
    embedding_batch_size: int = 8  # Reduced for 8GB VRAM GPUs
    embedding_device: str = "auto"  # "auto" detects CUDA, falls back to CPU
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class VectorStoreSettings(BaseSettings):
    """ChromaDB vector store configuration.

    Fields can be overridden via ``VECTORSTORE_``-prefixed environment
    variables.
    """

    model_config = SettingsConfigDict(env_prefix="VECTORSTORE_")

    # Where ChromaDB persists its data on disk
    persist_directory: Path = Path("./data/chroma_db")
    collection_name: str = "coderag_chunks"
    distance_metric: str = "cosine"
    # Opt out of ChromaDB's anonymized usage telemetry
    anonymized_telemetry: bool = False
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class IngestionSettings(BaseSettings):
    """Repository ingestion configuration.

    Fields can be overridden via ``INGESTION_``-prefixed environment
    variables.
    """

    model_config = SettingsConfigDict(env_prefix="INGESTION_")

    # Local clone cache for indexed repositories
    repos_cache_dir: Path = Path("./data/repos")
    # Files larger than this are skipped during ingestion
    max_file_size_kb: int = 500
    default_branch: str = "main"
    # Text-splitting parameters (characters per chunk / overlap between chunks)
    chunk_size: int = 1500
    chunk_overlap: int = 200

    # Large repository handling: hard caps to bound index size and memory
    max_files_per_repo: int = 5000
    max_total_chunks: int = 50000
    batch_size: int = 100
    stream_processing: bool = True

    # Warning thresholds (informational, below the hard caps above)
    warn_files_threshold: int = 1000
    warn_chunks_threshold: int = 10000

    # Only source files matching these globs are indexed
    include_patterns: list[str] = Field(
        default_factory=lambda: ["*.py", "*.js", "*.ts", "*.java", "*.go", "*.rs", "*.c", "*.cpp", "*.h"]
    )
    # Excludes dependency dirs, build artifacts, lockfiles, and
    # secret-looking files (.env, credentials, passwords) to keep them
    # out of the index.
    exclude_patterns: list[str] = Field(
        default_factory=lambda: [
            "**/node_modules/**",
            "**/.git/**",
            "**/venv/**",
            "**/__pycache__/**",
            "**/dist/**",
            "**/build/**",
            "**/*.min.js",
            "**/*.min.css",
            "**/package-lock.json",
            "**/yarn.lock",
            "**/poetry.lock",
            "**/.env",
            "**/.env.*",
            "**/credentials*",
            "**/*secret*",
            "**/*password*",
        ]
    )
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class RetrievalSettings(BaseSettings):
    """Retrieval configuration.

    Fields can be overridden via ``RETRIEVAL_``-prefixed environment
    variables.
    """

    model_config = SettingsConfigDict(env_prefix="RETRIEVAL_")

    # Number of chunks returned per query by default, and the upper bound
    default_top_k: int = 5
    max_top_k: int = 20
    # Minimum similarity score for a chunk to be considered relevant
    similarity_threshold: float = 0.3
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ServerSettings(BaseSettings):
    """Server configuration.

    Fields can be overridden via ``SERVER_``-prefixed environment variables.
    """

    model_config = SettingsConfigDict(env_prefix="SERVER_")

    # Uvicorn bind address and worker options
    host: str = "0.0.0.0"
    port: int = 8000
    reload: bool = False
    workers: int = 1
    log_level: str = "info"
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class Settings(BaseSettings):
    """Main application settings.

    Aggregates all sub-settings; values are read from the process
    environment and an optional ``.env`` file, with unknown keys ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_name: str = "CodeRAG"
    app_version: str = "0.1.0"
    debug: bool = False
    # Root directory for all persisted application data
    data_dir: Path = Path("./data")

    # Sub-settings; each reads its own env-prefixed variables
    models: ModelSettings = Field(default_factory=ModelSettings)
    vectorstore: VectorStoreSettings = Field(default_factory=VectorStoreSettings)
    ingestion: IngestionSettings = Field(default_factory=IngestionSettings)
    retrieval: RetrievalSettings = Field(default_factory=RetrievalSettings)
    server: ServerSettings = Field(default_factory=ServerSettings)

    def ensure_directories(self) -> None:
        """Create required directories if they don't exist."""
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.vectorstore.persist_directory.mkdir(parents=True, exist_ok=True)
        self.ingestion.repos_cache_dir.mkdir(parents=True, exist_ok=True)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
_settings: Optional[Settings] = None
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def get_settings() -> Settings:
    """Return the process-wide Settings singleton, creating it on first use.

    The first call instantiates Settings (loading env/.env values) and
    ensures the data directories exist; later calls return the cached object.
    """
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings()
    _settings.ensure_directories()
    return _settings
|
coderag/generation/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generation module: LLM inference and response generation with citations."""
|
| 2 |
+
|
| 3 |
+
from coderag.generation.generator import ResponseGenerator
|
| 4 |
+
from coderag.generation.prompts import SYSTEM_PROMPT, build_prompt
|
| 5 |
+
from coderag.generation.citations import CitationParser
|
| 6 |
+
|
| 7 |
+
__all__ = ["ResponseGenerator", "SYSTEM_PROMPT", "build_prompt", "CitationParser"]
|
coderag/generation/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (531 Bytes). View file
|
|
|
coderag/generation/__pycache__/citations.cpython-313.pyc
ADDED
|
Binary file (3.8 kB). View file
|
|
|
coderag/generation/__pycache__/generator.cpython-313.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
coderag/generation/__pycache__/prompts.cpython-313.pyc
ADDED
|
Binary file (3.15 kB). View file
|
|
|
coderag/generation/citations.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Citation parsing and formatting."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from coderag.models.response import Citation
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CitationParser:
    """Parses and validates citations from LLM responses."""

    # Matches citations of the form [file.py:10-20] or [path/to/file.py:10-20]
    CITATION_PATTERN = re.compile(r"\[([^\]]+):(\d+)-(\d+)\]")

    def parse_citations(self, text: str) -> list[Citation]:
        """Extract all citations from text.

        Args:
            text: Text containing citations

        Returns:
            List of parsed Citation objects
        """
        return [
            Citation(
                file_path=match.group(1),
                start_line=int(match.group(2)),
                end_line=int(match.group(3)),
            )
            for match in self.CITATION_PATTERN.finditer(text)
        ]

    def validate_citation(self, citation: Citation, available_files: set[str]) -> bool:
        """Check if a citation references an existing file."""
        return citation.file_path in available_files

    def validate_citations(
        self,
        citations: list[Citation],
        available_files: set[str],
    ) -> tuple[list[Citation], list[Citation]]:
        """Validate multiple citations.

        Returns:
            Tuple of (valid_citations, invalid_citations)
        """
        valid: list[Citation] = []
        invalid: list[Citation] = []
        for citation in citations:
            bucket = valid if self.validate_citation(citation, available_files) else invalid
            bucket.append(citation)
        return valid, invalid

    def format_citation(self, file_path: str, start_line: int, end_line: int) -> str:
        """Format a citation string."""
        return f"[{file_path}:{start_line}-{end_line}]"

    def has_citations(self, text: str) -> bool:
        """Check if text contains any citations."""
        return self.CITATION_PATTERN.search(text) is not None

    def count_citations(self, text: str) -> int:
        """Count citations in text."""
        return sum(1 for _ in self.CITATION_PATTERN.finditer(text))

    def extract_unique_files(self, citations: list[Citation]) -> set[str]:
        """Get unique file paths from citations."""
        return {citation.file_path for citation in citations}
|
coderag/generation/generator.py
ADDED
|
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Response generation using local or remote LLMs."""
|
| 2 |
+
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
from coderag.config import get_settings
|
| 6 |
+
from coderag.generation.citations import CitationParser
|
| 7 |
+
from coderag.generation.prompts import SYSTEM_PROMPT, build_prompt, build_no_context_response
|
| 8 |
+
from coderag.logging import get_logger
|
| 9 |
+
from coderag.models.response import Response
|
| 10 |
+
from coderag.models.query import Query
|
| 11 |
+
from coderag.retrieval.retriever import Retriever
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ResponseGenerator:
|
| 17 |
+
"""Generates grounded responses using local or remote LLMs."""
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
retriever: Optional[Retriever] = None,
|
| 22 |
+
) -> None:
|
| 23 |
+
self.settings = get_settings()
|
| 24 |
+
self.retriever = retriever or Retriever()
|
| 25 |
+
self.citation_parser = CitationParser()
|
| 26 |
+
|
| 27 |
+
self.provider = self.settings.models.llm_provider.lower()
|
| 28 |
+
self._client = None
|
| 29 |
+
self._local_model = None
|
| 30 |
+
self._local_tokenizer = None
|
| 31 |
+
|
| 32 |
+
logger.info("ResponseGenerator initialized", provider=self.provider)
|
| 33 |
+
|
| 34 |
+
def _get_api_client(self):
|
| 35 |
+
"""Get or create API client for remote providers."""
|
| 36 |
+
if self._client is not None:
|
| 37 |
+
return self._client
|
| 38 |
+
|
| 39 |
+
import httpx
|
| 40 |
+
from openai import OpenAI
|
| 41 |
+
|
| 42 |
+
api_key = self.settings.models.llm_api_key
|
| 43 |
+
if not api_key:
|
| 44 |
+
raise ValueError(f"API key required for provider: {self.provider}")
|
| 45 |
+
|
| 46 |
+
# Provider-specific configurations
|
| 47 |
+
provider_configs = {
|
| 48 |
+
"openai": {
|
| 49 |
+
"base_url": "https://api.openai.com/v1",
|
| 50 |
+
"default_model": "gpt-4o-mini",
|
| 51 |
+
},
|
| 52 |
+
"groq": {
|
| 53 |
+
"base_url": "https://api.groq.com/openai/v1",
|
| 54 |
+
"default_model": "llama-3.3-70b-versatile",
|
| 55 |
+
},
|
| 56 |
+
"anthropic": {
|
| 57 |
+
"base_url": "https://api.anthropic.com/v1",
|
| 58 |
+
"default_model": "claude-3-5-sonnet-20241022",
|
| 59 |
+
},
|
| 60 |
+
"openrouter": {
|
| 61 |
+
"base_url": "https://openrouter.ai/api/v1",
|
| 62 |
+
"default_model": "anthropic/claude-3.5-sonnet",
|
| 63 |
+
},
|
| 64 |
+
"together": {
|
| 65 |
+
"base_url": "https://api.together.xyz/v1",
|
| 66 |
+
"default_model": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
|
| 67 |
+
},
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
config = provider_configs.get(self.provider, {})
|
| 71 |
+
base_url = self.settings.models.llm_api_base or config.get("base_url")
|
| 72 |
+
|
| 73 |
+
if not base_url:
|
| 74 |
+
raise ValueError(f"Unknown provider: {self.provider}")
|
| 75 |
+
|
| 76 |
+
# Set default model if not specified and it's a known provider
|
| 77 |
+
if self.settings.models.llm_name.startswith("Qwen/"):
|
| 78 |
+
self.model_name = config.get("default_model", self.settings.models.llm_name)
|
| 79 |
+
else:
|
| 80 |
+
self.model_name = self.settings.models.llm_name
|
| 81 |
+
|
| 82 |
+
self._client = OpenAI(
|
| 83 |
+
api_key=api_key,
|
| 84 |
+
base_url=base_url,
|
| 85 |
+
http_client=httpx.Client(timeout=120.0),
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
logger.info("API client created", provider=self.provider, model=self.model_name)
|
| 89 |
+
return self._client
|
| 90 |
+
|
| 91 |
+
def _load_local_model(self):
|
| 92 |
+
"""Load local model with transformers."""
|
| 93 |
+
if self._local_model is not None:
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
import torch
|
| 97 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 98 |
+
|
| 99 |
+
if not torch.cuda.is_available():
|
| 100 |
+
raise RuntimeError(
|
| 101 |
+
"Local LLM requires a CUDA-capable GPU. Options:\n"
|
| 102 |
+
" 1. Use a cloud provider (free): MODEL_LLM_PROVIDER=groq\n"
|
| 103 |
+
" Get API key at: https://console.groq.com/keys\n"
|
| 104 |
+
" 2. Install CUDA and a compatible GPU"
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
logger.info("Loading local LLM", model=self.settings.models.llm_name)
|
| 108 |
+
|
| 109 |
+
if self.settings.models.llm_use_4bit:
|
| 110 |
+
bnb_config = BitsAndBytesConfig(
|
| 111 |
+
load_in_4bit=True,
|
| 112 |
+
bnb_4bit_quant_type="nf4",
|
| 113 |
+
bnb_4bit_compute_dtype=torch.float16,
|
| 114 |
+
bnb_4bit_use_double_quant=True,
|
| 115 |
+
)
|
| 116 |
+
else:
|
| 117 |
+
bnb_config = None
|
| 118 |
+
|
| 119 |
+
self._local_tokenizer = AutoTokenizer.from_pretrained(
|
| 120 |
+
self.settings.models.llm_name,
|
| 121 |
+
trust_remote_code=True,
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
self._local_model = AutoModelForCausalLM.from_pretrained(
|
| 125 |
+
self.settings.models.llm_name,
|
| 126 |
+
quantization_config=bnb_config,
|
| 127 |
+
device_map=self.settings.models.llm_device_map,
|
| 128 |
+
trust_remote_code=True,
|
| 129 |
+
torch_dtype=torch.float16,
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
logger.info("Local LLM loaded successfully")
|
| 133 |
+
|
| 134 |
+
def generate(self, query: Query) -> Response:
|
| 135 |
+
"""Generate a response for a query."""
|
| 136 |
+
# Retrieve relevant chunks
|
| 137 |
+
chunks, context = self.retriever.retrieve_with_context(
|
| 138 |
+
query.question,
|
| 139 |
+
query.repo_id,
|
| 140 |
+
query.top_k,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# Handle no results
|
| 144 |
+
if not chunks:
|
| 145 |
+
return Response(
|
| 146 |
+
answer=build_no_context_response(),
|
| 147 |
+
citations=[],
|
| 148 |
+
retrieved_chunks=[],
|
| 149 |
+
grounded=False,
|
| 150 |
+
query_id=query.id,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# Build prompt and generate
|
| 154 |
+
prompt = build_prompt(query.question, context)
|
| 155 |
+
|
| 156 |
+
if self.provider == "local":
|
| 157 |
+
answer = self._generate_local(prompt)
|
| 158 |
+
else:
|
| 159 |
+
answer = self._generate_api(prompt)
|
| 160 |
+
|
| 161 |
+
# Parse citations from answer
|
| 162 |
+
citations = self.citation_parser.parse_citations(answer)
|
| 163 |
+
|
| 164 |
+
# Determine if response is grounded
|
| 165 |
+
grounded = len(citations) > 0 and len(chunks) > 0
|
| 166 |
+
|
| 167 |
+
return Response(
|
| 168 |
+
answer=answer,
|
| 169 |
+
citations=citations,
|
| 170 |
+
retrieved_chunks=chunks,
|
| 171 |
+
grounded=grounded,
|
| 172 |
+
query_id=query.id,
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
def _generate_api(self, prompt: str) -> str:
|
| 176 |
+
"""Generate using remote API."""
|
| 177 |
+
client = self._get_api_client()
|
| 178 |
+
|
| 179 |
+
messages = [
|
| 180 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 181 |
+
{"role": "user", "content": prompt},
|
| 182 |
+
]
|
| 183 |
+
|
| 184 |
+
response = client.chat.completions.create(
|
| 185 |
+
model=self.model_name,
|
| 186 |
+
messages=messages,
|
| 187 |
+
max_tokens=self.settings.models.llm_max_new_tokens,
|
| 188 |
+
temperature=self.settings.models.llm_temperature,
|
| 189 |
+
top_p=self.settings.models.llm_top_p,
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
return response.choices[0].message.content.strip()
|
| 193 |
+
|
| 194 |
+
def _generate_local(self, prompt: str) -> str:
|
| 195 |
+
"""Generate using local model."""
|
| 196 |
+
import torch
|
| 197 |
+
|
| 198 |
+
self._load_local_model()
|
| 199 |
+
|
| 200 |
+
messages = [
|
| 201 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 202 |
+
{"role": "user", "content": prompt},
|
| 203 |
+
]
|
| 204 |
+
|
| 205 |
+
text = self._local_tokenizer.apply_chat_template(
|
| 206 |
+
messages,
|
| 207 |
+
tokenize=False,
|
| 208 |
+
add_generation_prompt=True,
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
inputs = self._local_tokenizer(text, return_tensors="pt").to(self._local_model.device)
|
| 212 |
+
|
| 213 |
+
with torch.no_grad():
|
| 214 |
+
outputs = self._local_model.generate(
|
| 215 |
+
**inputs,
|
| 216 |
+
max_new_tokens=self.settings.models.llm_max_new_tokens,
|
| 217 |
+
temperature=self.settings.models.llm_temperature,
|
| 218 |
+
top_p=self.settings.models.llm_top_p,
|
| 219 |
+
do_sample=True,
|
| 220 |
+
pad_token_id=self._local_tokenizer.eos_token_id,
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
generated = outputs[0][inputs["input_ids"].shape[1]:]
|
| 224 |
+
response = self._local_tokenizer.decode(generated, skip_special_tokens=True)
|
| 225 |
+
|
| 226 |
+
return response.strip()
|
| 227 |
+
|
| 228 |
+
def unload(self) -> None:
|
| 229 |
+
"""Unload models from memory."""
|
| 230 |
+
if self._local_model is not None:
|
| 231 |
+
del self._local_model
|
| 232 |
+
self._local_model = None
|
| 233 |
+
if self._local_tokenizer is not None:
|
| 234 |
+
del self._local_tokenizer
|
| 235 |
+
self._local_tokenizer = None
|
| 236 |
+
|
| 237 |
+
import torch
|
| 238 |
+
if torch.cuda.is_available():
|
| 239 |
+
torch.cuda.empty_cache()
|
| 240 |
+
|
| 241 |
+
logger.info("Models unloaded")
|
coderag/generation/prompts.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""System prompts for grounded code Q&A."""
|
| 2 |
+
|
| 3 |
+
SYSTEM_PROMPT = """You are a code assistant that answers questions about a repository.
|
| 4 |
+
|
| 5 |
+
CRITICAL RULES - YOU MUST FOLLOW THESE:
|
| 6 |
+
|
| 7 |
+
1. FIRST, check if the retrieved chunks are RELEVANT to the question being asked.
|
| 8 |
+
- If the chunks discuss completely different topics than the question, respond:
|
| 9 |
+
"I could not find information about this in the indexed repository."
|
| 10 |
+
- Do NOT try to make connections that don't exist.
|
| 11 |
+
|
| 12 |
+
2. Only answer based on EXPLICIT information in the provided code chunks.
|
| 13 |
+
- Every claim MUST have a citation: [file_path:start_line-end_line]
|
| 14 |
+
- If you cannot cite it, do NOT say it.
|
| 15 |
+
|
| 16 |
+
3. NEVER HALLUCINATE:
|
| 17 |
+
- Do NOT invent code, functions, files, or behaviors
|
| 18 |
+
- Do NOT answer questions about topics not in the chunks (e.g., if asked about "food inventory" but chunks are about "code embeddings", say you don't have that information)
|
| 19 |
+
- Do NOT make assumptions about what the code might do
|
| 20 |
+
|
| 21 |
+
4. When to refuse:
|
| 22 |
+
- The question is about something not covered in the chunks
|
| 23 |
+
- The chunks are about a completely different topic
|
| 24 |
+
- You would need to guess or speculate
|
| 25 |
+
|
| 26 |
+
CITATION FORMAT: [file_path:start_line-end_line]
|
| 27 |
+
Example: [src/auth.py:45-78]
|
| 28 |
+
|
| 29 |
+
RESPONSE FORMAT:
|
| 30 |
+
- Start with a direct answer IF AND ONLY IF the chunks contain relevant information
|
| 31 |
+
- Include citations inline with every factual statement
|
| 32 |
+
- If showing code, quote it exactly from the chunks"""
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def build_prompt(question: str, context: str) -> str:
|
| 36 |
+
"""Build the full prompt with context and question.
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
question: User's question
|
| 40 |
+
context: Retrieved code chunks formatted as context
|
| 41 |
+
|
| 42 |
+
Returns:
|
| 43 |
+
Complete prompt for the LLM
|
| 44 |
+
"""
|
| 45 |
+
return f"""Based on the following code chunks from the repository, answer the question.
|
| 46 |
+
|
| 47 |
+
## Retrieved Code Chunks
|
| 48 |
+
|
| 49 |
+
{context}
|
| 50 |
+
|
| 51 |
+
## Question
|
| 52 |
+
|
| 53 |
+
{question}
|
| 54 |
+
|
| 55 |
+
## Answer
|
| 56 |
+
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def build_no_context_response() -> str:
|
| 61 |
+
"""Response when no relevant context is found."""
|
| 62 |
+
return "I could not find information about this in the indexed repository."
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def build_clarification_prompt(question: str, ambiguities: list[str]) -> str:
|
| 66 |
+
"""Build prompt asking for clarification."""
|
| 67 |
+
ambiguity_list = "\n".join(f"- {a}" for a in ambiguities)
|
| 68 |
+
return f"""Your question "{question}" is ambiguous. Could you clarify:
|
| 69 |
+
|
| 70 |
+
{ambiguity_list}
|
| 71 |
+
|
| 72 |
+
Please provide more specific details so I can give you an accurate answer."""
|
coderag/indexing/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Indexing module: Embedding generation and vector storage."""
|
| 2 |
+
|
| 3 |
+
from coderag.indexing.embeddings import EmbeddingGenerator
|
| 4 |
+
from coderag.indexing.vectorstore import VectorStore
|
| 5 |
+
|
| 6 |
+
__all__ = ["EmbeddingGenerator", "VectorStore"]
|
coderag/indexing/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (416 Bytes). View file
|
|
|
coderag/indexing/__pycache__/embeddings.cpython-313.pyc
ADDED
|
Binary file (8.05 kB). View file
|
|
|
coderag/indexing/__pycache__/vectorstore.cpython-313.pyc
ADDED
|
Binary file (8.95 kB). View file
|
|
|
coderag/indexing/embeddings.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Embedding generation using nomic-embed-text."""
|
| 2 |
+
|
| 3 |
+
from typing import Iterator, Optional
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
|
| 8 |
+
from coderag.config import get_settings
|
| 9 |
+
from coderag.logging import get_logger
|
| 10 |
+
from coderag.models.chunk import Chunk
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EmbeddingGenerator:
|
| 16 |
+
"""Generates embeddings using nomic-embed-text v1.5."""
|
| 17 |
+
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
model_name: Optional[str] = None,
|
| 21 |
+
device: Optional[str] = None,
|
| 22 |
+
batch_size: Optional[int] = None,
|
| 23 |
+
) -> None:
|
| 24 |
+
settings = get_settings()
|
| 25 |
+
self.model_name = model_name or settings.models.embedding_name
|
| 26 |
+
self.device = self._resolve_device(device or settings.models.embedding_device)
|
| 27 |
+
self.batch_size = batch_size or settings.models.embedding_batch_size
|
| 28 |
+
self._model: Optional[SentenceTransformer] = None
|
| 29 |
+
|
| 30 |
+
def _resolve_device(self, device: str) -> str:
|
| 31 |
+
"""Resolve device, falling back to CPU if CUDA unavailable."""
|
| 32 |
+
if device == "auto":
|
| 33 |
+
return "cuda" if torch.cuda.is_available() else "cpu"
|
| 34 |
+
if device == "cuda" and not torch.cuda.is_available():
|
| 35 |
+
logger.warning("CUDA not available, falling back to CPU for embeddings")
|
| 36 |
+
return "cpu"
|
| 37 |
+
return device
|
| 38 |
+
|
| 39 |
+
@property
|
| 40 |
+
def model(self) -> SentenceTransformer:
|
| 41 |
+
if self._model is None:
|
| 42 |
+
self._load_model()
|
| 43 |
+
return self._model
|
| 44 |
+
|
| 45 |
+
def _load_model(self) -> None:
|
| 46 |
+
logger.info("Loading embedding model", model=self.model_name, device=self.device)
|
| 47 |
+
self._model = SentenceTransformer(
|
| 48 |
+
self.model_name,
|
| 49 |
+
device=self.device,
|
| 50 |
+
trust_remote_code=True,
|
| 51 |
+
)
|
| 52 |
+
logger.info("Embedding model loaded")
|
| 53 |
+
|
| 54 |
+
def generate_embedding(self, text: str, is_query: bool = False) -> list[float]:
|
| 55 |
+
# nomic-embed uses task prefixes
|
| 56 |
+
if is_query:
|
| 57 |
+
text = f"search_query: {text}"
|
| 58 |
+
else:
|
| 59 |
+
text = f"search_document: {text}"
|
| 60 |
+
|
| 61 |
+
embedding = self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
|
| 62 |
+
return embedding.tolist()
|
| 63 |
+
|
| 64 |
+
def generate_embeddings(
|
| 65 |
+
self,
|
| 66 |
+
texts: list[str],
|
| 67 |
+
is_query: bool = False,
|
| 68 |
+
show_progress: bool = True,
|
| 69 |
+
) -> list[list[float]]:
|
| 70 |
+
# Add prefixes
|
| 71 |
+
if is_query:
|
| 72 |
+
texts = [f"search_query: {t}" for t in texts]
|
| 73 |
+
else:
|
| 74 |
+
texts = [f"search_document: {t}" for t in texts]
|
| 75 |
+
|
| 76 |
+
embeddings = self.model.encode(
|
| 77 |
+
texts,
|
| 78 |
+
batch_size=self.batch_size,
|
| 79 |
+
convert_to_numpy=True,
|
| 80 |
+
normalize_embeddings=True,
|
| 81 |
+
show_progress_bar=show_progress,
|
| 82 |
+
)
|
| 83 |
+
return embeddings.tolist()
|
| 84 |
+
|
| 85 |
+
def embed_chunks(
|
| 86 |
+
self,
|
| 87 |
+
chunks: list[Chunk],
|
| 88 |
+
show_progress: bool = True,
|
| 89 |
+
) -> list[Chunk]:
|
| 90 |
+
if not chunks:
|
| 91 |
+
return []
|
| 92 |
+
|
| 93 |
+
logger.info("Generating embeddings", num_chunks=len(chunks))
|
| 94 |
+
|
| 95 |
+
texts = [self._chunk_to_text(chunk) for chunk in chunks]
|
| 96 |
+
embeddings = self.generate_embeddings(texts, is_query=False, show_progress=show_progress)
|
| 97 |
+
|
| 98 |
+
for chunk, embedding in zip(chunks, embeddings):
|
| 99 |
+
chunk.embedding = embedding
|
| 100 |
+
|
| 101 |
+
logger.info("Embeddings generated", num_chunks=len(chunks))
|
| 102 |
+
return chunks
|
| 103 |
+
|
| 104 |
+
def embed_chunks_iter(
|
| 105 |
+
self,
|
| 106 |
+
chunks: Iterator[Chunk],
|
| 107 |
+
batch_size: Optional[int] = None,
|
| 108 |
+
) -> Iterator[Chunk]:
|
| 109 |
+
batch_size = batch_size or self.batch_size
|
| 110 |
+
batch: list[Chunk] = []
|
| 111 |
+
|
| 112 |
+
for chunk in chunks:
|
| 113 |
+
batch.append(chunk)
|
| 114 |
+
if len(batch) >= batch_size:
|
| 115 |
+
yield from self._embed_batch(batch)
|
| 116 |
+
batch = []
|
| 117 |
+
|
| 118 |
+
if batch:
|
| 119 |
+
yield from self._embed_batch(batch)
|
| 120 |
+
|
| 121 |
+
def _embed_batch(self, batch: list[Chunk]) -> Iterator[Chunk]:
|
| 122 |
+
texts = [self._chunk_to_text(chunk) for chunk in batch]
|
| 123 |
+
embeddings = self.generate_embeddings(texts, is_query=False, show_progress=False)
|
| 124 |
+
|
| 125 |
+
for chunk, embedding in zip(batch, embeddings):
|
| 126 |
+
chunk.embedding = embedding
|
| 127 |
+
yield chunk
|
| 128 |
+
|
| 129 |
+
def _chunk_to_text(self, chunk: Chunk) -> str:
|
| 130 |
+
parts = []
|
| 131 |
+
if chunk.name:
|
| 132 |
+
parts.append(f"{chunk.chunk_type.value}: {chunk.name}")
|
| 133 |
+
if chunk.metadata.signature:
|
| 134 |
+
parts.append(f"Signature: {chunk.metadata.signature}")
|
| 135 |
+
if chunk.metadata.docstring:
|
| 136 |
+
parts.append(f"Docstring: {chunk.metadata.docstring[:200]}")
|
| 137 |
+
parts.append(f"File: {chunk.file_path}")
|
| 138 |
+
parts.append(chunk.content)
|
| 139 |
+
return "\n".join(parts)
|
| 140 |
+
|
| 141 |
+
def unload(self) -> None:
|
| 142 |
+
if self._model is not None:
|
| 143 |
+
del self._model
|
| 144 |
+
self._model = None
|
| 145 |
+
if torch.cuda.is_available():
|
| 146 |
+
torch.cuda.empty_cache()
|
| 147 |
+
logger.info("Embedding model unloaded")
|
coderag/indexing/vectorstore.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ChromaDB vector store operations."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
import chromadb
|
| 7 |
+
from chromadb.config import Settings
|
| 8 |
+
|
| 9 |
+
from coderag.config import get_settings
|
| 10 |
+
from coderag.logging import get_logger
|
| 11 |
+
from coderag.models.chunk import Chunk
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class VectorStore:
|
| 17 |
+
"""ChromaDB vector store for chunk storage and retrieval."""
|
| 18 |
+
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
persist_directory: Optional[Path] = None,
|
| 22 |
+
collection_name: Optional[str] = None,
|
| 23 |
+
) -> None:
|
| 24 |
+
settings = get_settings()
|
| 25 |
+
self.persist_directory = persist_directory or settings.vectorstore.persist_directory
|
| 26 |
+
self.collection_name = collection_name or settings.vectorstore.collection_name
|
| 27 |
+
self._client: Optional[chromadb.PersistentClient] = None
|
| 28 |
+
self._collection: Optional[chromadb.Collection] = None
|
| 29 |
+
|
| 30 |
+
@property
|
| 31 |
+
def client(self) -> chromadb.PersistentClient:
|
| 32 |
+
if self._client is None:
|
| 33 |
+
self._init_client()
|
| 34 |
+
return self._client
|
| 35 |
+
|
| 36 |
+
@property
|
| 37 |
+
def collection(self) -> chromadb.Collection:
|
| 38 |
+
if self._collection is None:
|
| 39 |
+
self._init_collection()
|
| 40 |
+
return self._collection
|
| 41 |
+
|
| 42 |
+
def _init_client(self) -> None:
|
| 43 |
+
logger.info("Initializing ChromaDB", path=str(self.persist_directory))
|
| 44 |
+
self.persist_directory.mkdir(parents=True, exist_ok=True)
|
| 45 |
+
self._client = chromadb.PersistentClient(
|
| 46 |
+
path=str(self.persist_directory),
|
| 47 |
+
settings=Settings(anonymized_telemetry=False),
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
def _init_collection(self) -> None:
|
| 51 |
+
self._collection = self.client.get_or_create_collection(
|
| 52 |
+
name=self.collection_name,
|
| 53 |
+
metadata={"hnsw:space": "cosine"},
|
| 54 |
+
)
|
| 55 |
+
logger.info("Collection initialized", name=self.collection_name)
|
| 56 |
+
|
| 57 |
+
def add_chunks(self, chunks: list[Chunk]) -> int:
|
| 58 |
+
if not chunks:
|
| 59 |
+
return 0
|
| 60 |
+
|
| 61 |
+
ids = [chunk.id for chunk in chunks]
|
| 62 |
+
embeddings = [chunk.embedding for chunk in chunks if chunk.embedding]
|
| 63 |
+
documents = [chunk.content for chunk in chunks]
|
| 64 |
+
metadatas = [chunk.to_dict() for chunk in chunks]
|
| 65 |
+
|
| 66 |
+
# Remove embedding and filter None values (ChromaDB doesn't accept None)
|
| 67 |
+
cleaned_metadatas = []
|
| 68 |
+
for m in metadatas:
|
| 69 |
+
m.pop("embedding", None)
|
| 70 |
+
m.pop("content", None) # Already stored in documents
|
| 71 |
+
# Filter out None values - ChromaDB only accepts str, int, float, bool
|
| 72 |
+
cleaned = {k: v for k, v in m.items() if v is not None}
|
| 73 |
+
cleaned_metadatas.append(cleaned)
|
| 74 |
+
|
| 75 |
+
self.collection.add(
|
| 76 |
+
ids=ids,
|
| 77 |
+
embeddings=embeddings,
|
| 78 |
+
documents=documents,
|
| 79 |
+
metadatas=cleaned_metadatas,
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
logger.info("Chunks added to vector store", count=len(chunks))
|
| 83 |
+
return len(chunks)
|
| 84 |
+
|
| 85 |
+
def query(
|
| 86 |
+
self,
|
| 87 |
+
query_embedding: list[float],
|
| 88 |
+
repo_id: str,
|
| 89 |
+
top_k: int = 5,
|
| 90 |
+
similarity_threshold: float = 0.0,
|
| 91 |
+
) -> list[tuple[Chunk, float]]:
|
| 92 |
+
results = self.collection.query(
|
| 93 |
+
query_embeddings=[query_embedding],
|
| 94 |
+
n_results=top_k,
|
| 95 |
+
where={"repo_id": repo_id},
|
| 96 |
+
include=["documents", "metadatas", "distances"],
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
chunks_with_scores = []
|
| 100 |
+
if results["ids"] and results["ids"][0]:
|
| 101 |
+
for i, chunk_id in enumerate(results["ids"][0]):
|
| 102 |
+
# ChromaDB returns distances, convert to similarity for cosine
|
| 103 |
+
distance = results["distances"][0][i]
|
| 104 |
+
similarity = 1 - distance
|
| 105 |
+
|
| 106 |
+
if similarity >= similarity_threshold:
|
| 107 |
+
metadata = results["metadatas"][0][i]
|
| 108 |
+
metadata["id"] = chunk_id
|
| 109 |
+
metadata["content"] = results["documents"][0][i]
|
| 110 |
+
chunk = Chunk.from_dict(metadata)
|
| 111 |
+
chunks_with_scores.append((chunk, similarity))
|
| 112 |
+
|
| 113 |
+
return chunks_with_scores
|
| 114 |
+
|
| 115 |
+
def delete_repo_chunks(self, repo_id: str) -> int:
|
| 116 |
+
# Get all chunks for this repo
|
| 117 |
+
results = self.collection.get(where={"repo_id": repo_id}, include=[])
|
| 118 |
+
|
| 119 |
+
if results["ids"]:
|
| 120 |
+
self.collection.delete(ids=results["ids"])
|
| 121 |
+
count = len(results["ids"])
|
| 122 |
+
logger.info("Deleted repo chunks", repo_id=repo_id, count=count)
|
| 123 |
+
return count
|
| 124 |
+
return 0
|
| 125 |
+
|
| 126 |
+
def delete_file_chunks(self, repo_id: str, file_path: str) -> int:
|
| 127 |
+
"""Delete chunks for a specific file in a repository (for incremental updates)."""
|
| 128 |
+
results = self.collection.get(
|
| 129 |
+
where={"$and": [{"repo_id": repo_id}, {"file_path": file_path}]},
|
| 130 |
+
include=[],
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
if results["ids"]:
|
| 134 |
+
self.collection.delete(ids=results["ids"])
|
| 135 |
+
count = len(results["ids"])
|
| 136 |
+
logger.info("Deleted file chunks", repo_id=repo_id, file_path=file_path, count=count)
|
| 137 |
+
return count
|
| 138 |
+
return 0
|
| 139 |
+
|
| 140 |
+
def get_indexed_files(self, repo_id: str) -> set[str]:
|
| 141 |
+
"""Get set of file paths indexed for a repository."""
|
| 142 |
+
results = self.collection.get(
|
| 143 |
+
where={"repo_id": repo_id},
|
| 144 |
+
include=["metadatas"],
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
files = set()
|
| 148 |
+
if results["metadatas"]:
|
| 149 |
+
for metadata in results["metadatas"]:
|
| 150 |
+
if "file_path" in metadata:
|
| 151 |
+
files.add(metadata["file_path"])
|
| 152 |
+
return files
|
| 153 |
+
|
| 154 |
+
def get_repo_chunk_count(self, repo_id: str) -> int:
|
| 155 |
+
results = self.collection.get(where={"repo_id": repo_id}, include=[])
|
| 156 |
+
return len(results["ids"]) if results["ids"] else 0
|
| 157 |
+
|
| 158 |
+
def get_all_repo_ids(self) -> list[str]:
|
| 159 |
+
results = self.collection.get(include=["metadatas"])
|
| 160 |
+
repo_ids = set()
|
| 161 |
+
if results["metadatas"]:
|
| 162 |
+
for metadata in results["metadatas"]:
|
| 163 |
+
if "repo_id" in metadata:
|
| 164 |
+
repo_ids.add(metadata["repo_id"])
|
| 165 |
+
return list(repo_ids)
|
| 166 |
+
|
| 167 |
+
def clear(self) -> None:
|
| 168 |
+
self.client.delete_collection(self.collection_name)
|
| 169 |
+
self._collection = None
|
| 170 |
+
logger.info("Collection cleared", name=self.collection_name)
|
coderag/ingestion/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ingestion module: Repository loading, file filtering, and semantic chunking."""
|
| 2 |
+
|
| 3 |
+
from coderag.ingestion.validator import GitHubURLValidator
|
| 4 |
+
from coderag.ingestion.loader import RepositoryLoader
|
| 5 |
+
from coderag.ingestion.filter import FileFilter
|
| 6 |
+
from coderag.ingestion.chunker import CodeChunker
|
| 7 |
+
|
| 8 |
+
__all__ = ["GitHubURLValidator", "RepositoryLoader", "FileFilter", "CodeChunker"]
|
coderag/ingestion/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (566 Bytes). View file
|
|
|
coderag/ingestion/__pycache__/chunker.cpython-313.pyc
ADDED
|
Binary file (10.4 kB). View file
|
|
|
coderag/ingestion/__pycache__/filter.cpython-313.pyc
ADDED
|
Binary file (4.27 kB). View file
|
|
|
coderag/ingestion/__pycache__/loader.cpython-313.pyc
ADDED
|
Binary file (6.14 kB). View file
|
|
|
coderag/ingestion/__pycache__/validator.cpython-313.pyc
ADDED
|
Binary file (6.95 kB). View file
|
|
|
coderag/ingestion/chunker.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Code chunking with Tree-sitter and text fallback."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Iterator, Optional
|
| 6 |
+
|
| 7 |
+
from coderag.config import get_settings
|
| 8 |
+
from coderag.logging import get_logger
|
| 9 |
+
from coderag.models.chunk import Chunk, ChunkMetadata, ChunkType
|
| 10 |
+
from coderag.models.document import Document
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class ChunkerConfig:
    """Chunker configuration (all sizes are measured in characters)."""
    chunk_size: int = 1500    # target size of a text-fallback chunk
    chunk_overlap: int = 200  # overlap carried between consecutive text chunks
    min_chunk_size: int = 50  # chunks smaller than this (after strip) are dropped
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CodeChunker:
    """Chunks code files into semantic units.

    Python sources are split along Tree-sitter definition boundaries
    (functions, classes, methods); everything else — and any Python file
    that fails to parse — falls back to size-based text chunking.
    """

    # Node types treated as function-like definitions.
    # NOTE(review): recent tree-sitter-python grammars emit async defs as
    # plain "function_definition" nodes; the async type is kept defensively.
    _FUNC_TYPES = ("function_definition", "async_function_definition")

    def __init__(self, config: Optional[ChunkerConfig] = None) -> None:
        """Create a chunker; sizes default to the ingestion settings."""
        settings = get_settings()
        self.config = config or ChunkerConfig(
            chunk_size=settings.ingestion.chunk_size,
            chunk_overlap=settings.ingestion.chunk_overlap,
        )
        self._tree_sitter_available = self._check_tree_sitter()

    def _check_tree_sitter(self) -> bool:
        """Return True when the tree-sitter Python grammar is importable."""
        try:
            import tree_sitter_python  # noqa: F401
            return True
        except ImportError:
            logger.warning("Tree-sitter not available, using text chunking")
            return False

    def chunk_document(self, document: Document) -> Iterator[Chunk]:
        """Yield chunks for *document*, choosing the best available strategy."""
        if document.language == "python" and self._tree_sitter_available:
            yield from self._chunk_python(document)
        else:
            yield from self._chunk_text(document)

    def _chunk_python(self, document: Document) -> Iterator[Chunk]:
        """Parse *document* with Tree-sitter; fall back to text on any failure."""
        try:
            import tree_sitter_python as tspython
            from tree_sitter import Language, Parser

            py_language = Language(tspython.language())
            parser = Parser(py_language)
            tree = parser.parse(bytes(document.content, "utf-8"))

            yield from self._extract_python_chunks(tree.root_node, document)

        except Exception as e:
            logger.warning("Tree-sitter parsing failed, falling back to text", error=str(e))
            yield from self._chunk_text(document)

    @staticmethod
    def _unwrap_decorated(node):
        """Return the definition wrapped by a ``decorated_definition`` node.

        Decorated functions/classes are nested one level deeper in the parse
        tree; without unwrapping they were previously invisible, pushing whole
        files of decorated code into plain text chunking.
        """
        if node.type == "decorated_definition":
            for child in node.children:
                if child.type in (
                    "function_definition",
                    "async_function_definition",
                    "class_definition",
                ):
                    return child
        return node

    def _extract_python_chunks(self, node, document: Document) -> Iterator[Chunk]:
        """Yield one chunk per top-level definition, plus one per method."""
        lines = document.content.split("\n")
        found_definition = False

        for child in node.children:
            definition = self._unwrap_decorated(child)

            if definition.type in self._FUNC_TYPES:
                found_definition = True
                yield self._create_chunk_from_node(
                    definition, document, lines, ChunkType.FUNCTION, span_node=child
                )
            elif definition.type == "class_definition":
                found_definition = True
                yield self._create_chunk_from_node(
                    definition, document, lines, ChunkType.CLASS, span_node=child
                )
                # Also extract the class's methods as separate chunks.
                class_name = self._get_node_name(definition)
                for class_child in definition.children:
                    if class_child.type != "block":
                        continue
                    for block_child in class_child.children:
                        method = self._unwrap_decorated(block_child)
                        if method.type in self._FUNC_TYPES:
                            yield self._create_chunk_from_node(
                                method,
                                document,
                                lines,
                                ChunkType.METHOD,
                                parent_name=class_name,
                                span_node=block_child,
                            )

        # No definitions at module level (e.g. a flat script): fall back to
        # text chunking so the content is not silently dropped.
        if not found_definition:
            yield from self._chunk_text(document)

    def _create_chunk_from_node(
        self,
        node,
        document: Document,
        lines: list[str],
        chunk_type: ChunkType,
        parent_name: Optional[str] = None,
        span_node=None,
    ) -> Chunk:
        """Build a Chunk for a definition node.

        Args:
            node: The definition used for name/signature/docstring extraction.
            chunk_type: Semantic category recorded in the chunk metadata.
            parent_name: Enclosing class name for methods, if any.
            span_node: Optional enclosing node (e.g. a decorated definition)
                whose full line range — decorators included — the chunk
                content should cover; defaults to *node*.
        """
        span = span_node if span_node is not None else node
        start_line = span.start_point[0] + 1  # tree-sitter rows are 0-based
        end_line = span.end_point[0] + 1
        content = "\n".join(lines[start_line - 1:end_line])

        metadata = ChunkMetadata(
            file_path=document.file_path,
            start_line=start_line,
            end_line=end_line,
            chunk_type=chunk_type,
            language=document.language,
            name=self._get_node_name(node),
            signature=self._get_signature(node, lines),
            docstring=self._get_docstring(node),
            parent_name=parent_name,
        )

        return Chunk(content=content, metadata=metadata, repo_id=document.repo_id)

    def _get_node_name(self, node) -> Optional[str]:
        """Return the identifier of a definition node, if present."""
        for child in node.children:
            if child.type == "identifier":
                return child.text.decode("utf-8")
        return None

    def _get_signature(self, node, lines: list[str]) -> Optional[str]:
        """Return the stripped first line of a function definition."""
        if node.type in self._FUNC_TYPES:
            return lines[node.start_point[0]].strip()
        return None

    def _get_docstring(self, node) -> Optional[str]:
        """Return the docstring text of a definition, without quote delimiters."""
        for child in node.children:
            if child.type != "block":
                continue
            for block_child in child.children:
                if block_child.type == "expression_statement":
                    for expr_child in block_child.children:
                        if expr_child.type == "string":
                            raw = expr_child.text.decode("utf-8").strip()
                            return self._strip_string_delimiters(raw)
        return None

    @staticmethod
    def _strip_string_delimiters(raw: str) -> str:
        """Strip one matched pair of quote delimiters from a string literal.

        BUG FIX: the previous implementation called ``str.strip`` with a
        string of quote characters, which treats the argument as a character
        *set* — any quote characters belonging to the docstring text itself
        were also removed (e.g. a docstring ending in a quoted word lost its
        closing quote). Only a single matching delimiter pair is removed
        here. String prefixes (r/f/b) are not handled — TODO if they appear.
        """
        for quote in ('"""', "'''", '"', "'"):
            if raw.startswith(quote) and raw.endswith(quote) and len(raw) >= 2 * len(quote):
                return raw[len(quote):-len(quote)]
        return raw

    def _chunk_text(self, document: Document) -> Iterator[Chunk]:
        """Split *document* into overlapping, size-bounded line chunks."""
        lines = document.content.split("\n")
        chunk_size = self.config.chunk_size
        overlap = self.config.chunk_overlap

        current_start = 0
        while current_start < len(lines):
            # Grow the chunk line by line until it reaches chunk_size chars.
            char_count = 0
            end_line = current_start
            while end_line < len(lines) and char_count < chunk_size:
                char_count += len(lines[end_line]) + 1  # +1 for the newline
                end_line += 1

            content = "\n".join(lines[current_start:end_line])

            if len(content.strip()) >= self.config.min_chunk_size:
                metadata = ChunkMetadata(
                    file_path=document.file_path,
                    start_line=current_start + 1,
                    end_line=end_line,
                    chunk_type=ChunkType.TEXT,
                    language=document.language,
                )
                yield Chunk(content=content, metadata=metadata, repo_id=document.repo_id)

            # Step forward, re-including roughly `overlap` chars of context.
            overlap_lines = 0
            overlap_chars = 0
            while overlap_lines < end_line - current_start and overlap_chars < overlap:
                overlap_chars += len(lines[end_line - 1 - overlap_lines]) + 1
                overlap_lines += 1

            next_start = end_line - overlap_lines
            if next_start <= current_start:
                # BUG FIX: a single line longer than chunk_size lets the
                # overlap swallow the whole step, so the start never advanced
                # — an infinite loop re-yielding the same chunk (or, on the
                # very first chunk, silently truncating the rest of the
                # file). Force at least one line of progress instead.
                next_start = current_start + 1
            current_start = next_start
            if end_line >= len(lines):
                break

    def chunk_files(self, documents: Iterator[Document]) -> Iterator[Chunk]:
        """Chunk every document, logging per-document and total counts."""
        total_chunks = 0
        for doc in documents:
            doc_chunks = 0
            for chunk in self.chunk_document(doc):
                doc_chunks += 1
                total_chunks += 1
                yield chunk
            logger.debug("Document chunked", file=doc.file_path, chunks=doc_chunks)
        logger.info("Chunking complete", total_chunks=total_chunks)
|
coderag/ingestion/filter.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""File filtering for repository indexing."""
|
| 2 |
+
|
| 3 |
+
import fnmatch
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Iterator, Optional
|
| 6 |
+
|
| 7 |
+
from coderag.config import get_settings
|
| 8 |
+
from coderag.logging import get_logger
|
| 9 |
+
|
| 10 |
+
logger = get_logger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class FileFilter:
    """Filters repository files for indexing based on glob patterns.

    A file is kept only when it matches an include pattern, matches no
    exclude pattern, is within the size limit, and does not look binary.
    """

    def __init__(
        self,
        include_patterns: Optional[list[str]] = None,
        exclude_patterns: Optional[list[str]] = None,
        max_file_size_kb: Optional[int] = None,
    ) -> None:
        """Initialize from explicit arguments, falling back to settings."""
        settings = get_settings()
        self.include_patterns = include_patterns or settings.ingestion.include_patterns
        self.exclude_patterns = exclude_patterns or settings.ingestion.exclude_patterns
        self.max_file_size = (max_file_size_kb or settings.ingestion.max_file_size_kb) * 1024

    def should_include(self, file_path: Path, repo_root: Path) -> bool:
        """Decide whether *file_path* should be indexed; exclusions win."""
        rel = str(file_path.relative_to(repo_root))
        # A pattern may match either the repo-relative path or the bare
        # file name, for both the exclude and include lists.
        if any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in self.exclude_patterns
            for candidate in (rel, file_path.name)
        ):
            return False
        return any(
            fnmatch.fnmatch(candidate, pattern)
            for pattern in self.include_patterns
            for candidate in (file_path.name, rel)
        )

    def check_file_size(self, file_path: Path) -> bool:
        """Return True when the file can be stat'ed and fits the size limit."""
        try:
            size = file_path.stat().st_size
        except OSError:
            return False
        return size <= self.max_file_size

    def is_binary(self, file_path: Path) -> bool:
        """Heuristic binary detection: a NUL byte within the first 8 KiB.

        Unreadable files are reported as binary so they get skipped.
        """
        try:
            with file_path.open("rb") as handle:
                return b"\x00" in handle.read(8192)
        except OSError:
            return True

    def filter_files(self, repo_root: Path) -> Iterator[Path]:
        """Recursively yield indexable files under *repo_root*."""
        kept = 0
        dropped = 0

        for candidate in repo_root.rglob("*"):
            if not candidate.is_file():
                continue

            if not self.should_include(candidate, repo_root):
                dropped += 1
                continue

            if not self.check_file_size(candidate):
                logger.debug("Skipping large file", path=str(candidate))
                dropped += 1
                continue

            if self.is_binary(candidate):
                logger.debug("Skipping binary file", path=str(candidate))
                dropped += 1
                continue

            kept += 1
            yield candidate

        logger.info("File filtering complete", included=kept, skipped=dropped)
|
coderag/ingestion/loader.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Repository loading and cloning."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Callable, Optional
|
| 5 |
+
|
| 6 |
+
from git import Repo, GitCommandError
|
| 7 |
+
|
| 8 |
+
from coderag.config import get_settings
|
| 9 |
+
from coderag.logging import get_logger
|
| 10 |
+
from coderag.ingestion.validator import GitHubRepoInfo
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
ProgressCallback = Callable[[str, int], None]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class LoaderError(Exception):
    """Repository loading error.

    Raised when cloning or updating a cached repository fails.
    """
    pass
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class RepositoryLoader:
    """Loads repositories from GitHub.

    Clones into a local cache directory laid out as ``cache_dir/owner/name``,
    using shallow single-branch clones, and updates existing clones in place.
    """

    def __init__(self, cache_dir: Optional[Path] = None) -> None:
        """Create the loader; defaults to the configured repos cache dir."""
        settings = get_settings()
        self.cache_dir = cache_dir or settings.ingestion.repos_cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def get_repo_path(self, repo_info: GitHubRepoInfo) -> Path:
        """Return the local cache path for *repo_info* (cache/owner/name)."""
        return self.cache_dir / repo_info.owner / repo_info.name

    def clone_repository(
        self,
        repo_info: GitHubRepoInfo,
        branch: Optional[str] = None,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Clone (or refresh) the repository, returning its local path.

        Branch candidates are tried in order: the explicit *branch*, the
        repo's detected default branch, then "main" and "master".

        Raises:
            LoaderError: if no candidate branch can be cloned, or an existing
                clone fails to update.
        """
        repo_path = self.get_repo_path(repo_info)

        # Try branches in order: specified, repo default, main, master
        branches_to_try = []
        if branch:
            branches_to_try.append(branch)
        if repo_info.branch and repo_info.branch not in branches_to_try:
            branches_to_try.append(repo_info.branch)
        if "main" not in branches_to_try:
            branches_to_try.append("main")
        if "master" not in branches_to_try:
            branches_to_try.append("master")

        if repo_path.exists():
            # An existing clone is refreshed in place; only the first branch
            # candidate is used for the update.
            logger.info("Repository exists, updating", path=str(repo_path))
            return self._update_repository(repo_path, branches_to_try[0], progress_callback)

        if progress_callback:
            progress_callback("Cloning repository", 0)

        repo_path.parent.mkdir(parents=True, exist_ok=True)

        last_error = None
        for try_branch in branches_to_try:
            try:
                logger.info("Trying to clone", url=repo_info.clone_url, branch=try_branch)
                # Shallow single-branch clone keeps the cache small and fast.
                Repo.clone_from(
                    repo_info.clone_url,
                    repo_path,
                    branch=try_branch,
                    depth=1,
                    single_branch=True,
                )
                if progress_callback:
                    progress_callback("Clone complete", 100)
                logger.info("Repository cloned", path=str(repo_path), branch=try_branch)
                return repo_path
            except GitCommandError as e:
                last_error = e
                logger.debug("Branch not found, trying next", branch=try_branch)
                # Clean up partial clone if any
                import shutil
                shutil.rmtree(repo_path, ignore_errors=True)
                continue

        raise LoaderError(f"Failed to clone repository (tried branches: {branches_to_try}): {last_error}")

    def _update_repository(
        self,
        repo_path: Path,
        branch: str,
        progress_callback: Optional[ProgressCallback] = None,
    ) -> Path:
        """Fetch/checkout/pull an existing clone; on failure, drop the cache.

        Raises:
            LoaderError: if the update fails. The cached copy is removed
                first so the next attempt starts from a fresh clone.
        """
        try:
            repo = Repo(repo_path)
            if progress_callback:
                progress_callback("Fetching updates", 30)
            repo.remotes.origin.fetch()
            repo.git.checkout(branch)
            repo.remotes.origin.pull()
            if progress_callback:
                progress_callback("Update complete", 100)
            logger.info("Repository updated", path=str(repo_path))
            return repo_path
        except GitCommandError as e:
            logger.warning("Update failed, re-cloning", error=str(e))
            import shutil
            shutil.rmtree(repo_path, ignore_errors=True)
            raise LoaderError(f"Failed to update, please re-clone: {e}")

    def is_cached(self, repo_info: GitHubRepoInfo) -> bool:
        """Return True when a local clone already exists for *repo_info*."""
        return self.get_repo_path(repo_info).exists()

    def delete_cache(self, repo_info: GitHubRepoInfo) -> None:
        """Remove the cached clone for *repo_info*, if present."""
        repo_path = self.get_repo_path(repo_info)
        if repo_path.exists():
            import shutil
            shutil.rmtree(repo_path)
            logger.info("Cache deleted", path=str(repo_path))
|
coderag/ingestion/validator.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GitHub URL validation and parsing."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import httpx
|
| 8 |
+
|
| 9 |
+
from coderag.logging import get_logger
|
| 10 |
+
|
| 11 |
+
logger = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class GitHubRepoInfo:
    """Parsed GitHub repository information."""

    owner: str  # GitHub account or organization name
    name: str  # repository name (without a trailing ".git")
    url: str  # canonical https URL of the repository
    branch: Optional[str] = None  # default branch; filled in by the API accessibility check

    @property
    def full_name(self) -> str:
        """Return the "owner/name" identifier."""
        return f"{self.owner}/{self.name}"

    @property
    def clone_url(self) -> str:
        """Return the https clone URL (ends with ".git")."""
        return f"https://github.com/{self.owner}/{self.name}.git"

    @property
    def api_url(self) -> str:
        """Return the GitHub REST API endpoint for this repository."""
        return f"https://api.github.com/repos/{self.owner}/{self.name}"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ValidationError(Exception):
    """URL validation error.

    Raised for malformed GitHub URLs, invalid owner/repo names, and
    repositories that are missing, private, or unreachable.
    """
    pass
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class GitHubURLValidator:
    """Validates and parses GitHub repository URLs.

    Accepts full https URLs, ssh URLs, and the short ``owner/repo`` form.
    """

    # Tried in order: full https URL, ssh URL, bare "owner/repo".
    GITHUB_PATTERNS = [
        r"^https?://github\.com/(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?/?$",
        r"^git@github\.com:(?P<owner>[^/]+)/(?P<name>[^/]+?)(?:\.git)?$",
        r"^(?P<owner>[a-zA-Z0-9](?:[a-zA-Z0-9]|-(?=[a-zA-Z0-9])){0,38})/(?P<name>[a-zA-Z0-9._-]+)$",
    ]

    def __init__(self, timeout: float = 10.0) -> None:
        """Args: timeout: seconds allowed for the GitHub API accessibility check."""
        self.timeout = timeout
        self._patterns = [re.compile(p) for p in self.GITHUB_PATTERNS]

    def parse_url(self, url: str) -> GitHubRepoInfo:
        """Parse *url* into a :class:`GitHubRepoInfo` (no network access).

        Raises:
            ValidationError: if the URL matches no known GitHub form or the
                owner/name segments contain invalid characters.
        """
        url = url.strip()
        for pattern in self._patterns:
            match = pattern.match(url)
            if not match:
                continue
            owner = match.group("owner")
            # BUG FIX: rstrip(".git") strips any trailing '.', 'g', 'i', 't'
            # *characters*, mangling names such as "tig" -> "" or
            # "mygit" -> "my". removesuffix() removes only a literal ".git".
            name = match.group("name").removesuffix(".git")
            if not self._is_valid_name(owner) or not self._is_valid_name(name):
                raise ValidationError(f"Invalid owner or repository name: {url}")
            return GitHubRepoInfo(owner=owner, name=name, url=f"https://github.com/{owner}/{name}")
        raise ValidationError(f"Invalid GitHub URL: {url}. Expected: https://github.com/owner/repo")

    def _is_valid_name(self, name: str) -> bool:
        """Return True when *name* is a plausible GitHub owner/repo segment."""
        if not name or len(name) > 100:
            return False
        return bool(re.match(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$", name))

    async def validate_repository(self, url: str, check_accessibility: bool = True) -> GitHubRepoInfo:
        """Parse *url* and (optionally) confirm the repository is reachable.

        Raises:
            ValidationError: on a malformed URL or a failed accessibility check.
        """
        repo_info = self.parse_url(url)
        if check_accessibility:
            await self._check_repo_accessible(repo_info)
        logger.info("Repository validated", owner=repo_info.owner, name=repo_info.name)
        return repo_info

    async def _check_repo_accessible(self, repo_info: GitHubRepoInfo) -> None:
        """Query the GitHub API; raise ValidationError if the repo is unusable.

        Side effect: fills in ``repo_info.branch`` with the default branch
        reported by the API.
        """
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            try:
                response = await client.get(repo_info.api_url)
                if response.status_code == 404:
                    raise ValidationError(f"Repository not found: {repo_info.full_name}")
                elif response.status_code == 403:
                    # NOTE(review): 403 can also be unauthenticated API rate
                    # limiting, not just access denial — confirm desired wording.
                    raise ValidationError(f"Access denied: {repo_info.full_name}")
                elif response.status_code != 200:
                    raise ValidationError(f"HTTP error {response.status_code}: {repo_info.full_name}")
                data = response.json()
                if data.get("private", False):
                    raise ValidationError(f"Private repository not supported: {repo_info.full_name}")
                repo_info.branch = data.get("default_branch", "main")
            except httpx.TimeoutException:
                raise ValidationError(f"Timeout checking repository: {repo_info.full_name}")
            except httpx.RequestError as e:
                # Chain the cause so network diagnostics aren't lost.
                raise ValidationError(f"Network error: {str(e)}") from e

    def validate_url_sync(self, url: str) -> GitHubRepoInfo:
        """Synchronous parse-only validation (no network access)."""
        return self.parse_url(url)
|
coderag/logging.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Structured logging configuration using structlog."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import sys
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
import structlog
|
| 8 |
+
from structlog.types import Processor
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def setup_logging(
    level: str = "INFO",
    json_format: bool = False,
    log_file: str | None = None,
) -> None:
    """Configure structured logging for the application.

    Args:
        level: Log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL).
        json_format: Emit JSON log lines (production) instead of colored text.
        log_file: Optional path of a file to also receive log output.
    """
    numeric_level = getattr(logging, level.upper())

    # Stdlib logging backs structlog's output.
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=numeric_level,
    )

    # Optionally mirror output into a file.
    if log_file:
        extra_handler = logging.FileHandler(log_file)
        extra_handler.setLevel(numeric_level)
        logging.getLogger().addHandler(extra_handler)

    # Processors common to both output formats.
    pipeline: list[Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.UnicodeDecoder(),
    ]

    if json_format:
        # Production: machine-readable JSON. Exception info is formatted
        # before the final renderer.
        pipeline += [
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Development: human-friendly colored console output.
        pipeline.append(structlog.dev.ConsoleRenderer(colors=True))

    structlog.configure(
        processors=pipeline,
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Get a structured logger instance.

    Args:
        name: Logger name (usually ``__name__`` of the calling module)

    Returns:
        A structlog ``BoundLogger`` honoring the ``setup_logging`` configuration
    """
    return structlog.get_logger(name)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class LogContext:
    """Context manager that temporarily binds structlog context variables."""

    def __init__(self, **kwargs: Any) -> None:
        """Remember the key/value pairs to bind while the context is active."""
        self.context = kwargs
        self._token: Any = None

    def __enter__(self) -> "LogContext":
        """Bind the stored variables into structlog's contextvars."""
        self._token = structlog.contextvars.bind_contextvars(**self.context)
        return self

    def __exit__(self, *args: Any) -> None:
        """Remove the previously bound variables again."""
        # Iterating the dict yields its keys, i.e. the names bound on entry.
        structlog.contextvars.unbind_contextvars(*self.context)


def log_operation(
    operation: str,
    **kwargs: Any,
) -> LogContext:
    """Create a logging context for an operation.

    Every log emitted inside the ``with`` block carries ``operation`` plus
    the given key/value pairs:

        with log_operation("indexing", repo_id="123"):
            logger.info("Starting indexing")  # includes repo_id
    """
    return LogContext(operation=operation, **kwargs)
|
coderag/main.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CodeRAG main application entry point."""
|
| 2 |
+
|
| 3 |
+
from contextlib import asynccontextmanager
|
| 4 |
+
|
| 5 |
+
import uvicorn
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
|
| 9 |
+
from coderag.config import get_settings
|
| 10 |
+
from coderag.logging import setup_logging, get_logger
|
| 11 |
+
|
| 12 |
+
# Initialize settings and logging
|
| 13 |
+
settings = get_settings()
|
| 14 |
+
setup_logging(level=settings.server.log_level.upper())
|
| 15 |
+
logger = get_logger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler.

    Logs a startup banner before the app begins serving and a shutdown
    message once it stops; no resources are acquired or released here.
    """
    logger.info(
        "Starting CodeRAG",
        app_name=settings.app_name,
        version=settings.app_version,
        debug=settings.debug,
    )
    yield  # the application serves requests while suspended here
    logger.info("Shutting down CodeRAG")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def create_app() -> FastAPI:
    """Create and configure the FastAPI application.

    Wires up CORS, a /health endpoint, the versioned REST API, and —
    best-effort — the MCP server and Gradio UI mounts.
    """
    app = FastAPI(
        title=settings.app_name,
        version=settings.app_version,
        description="RAG-based Q&A system for code repositories with verifiable citations",
        docs_url="/docs",
        redoc_url="/redoc",
        lifespan=lifespan,
    )

    # CORS middleware
    # NOTE(review): allow_origins=["*"] together with allow_credentials=True
    # is very permissive — confirm this is intended beyond demo deployments.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Health check endpoint
    @app.get("/health")
    async def health_check() -> dict:
        """Health check endpoint."""
        return {
            "status": "healthy",
            "app": settings.app_name,
            "version": settings.app_version,
        }

    # Register API routes
    # Deferred import — presumably to avoid import cycles at module load; confirm.
    from coderag.api.routes import router as api_router

    app.include_router(api_router, prefix="/api/v1")

    # Mount MCP server
    # Best-effort: a missing optional dependency must not prevent startup.
    try:
        from coderag.mcp.server import create_mcp_server

        mcp_server = create_mcp_server()
        mcp_app = mcp_server.streamable_http_app()
        app.mount("/mcp", mcp_app)
        logger.info("MCP server mounted at /mcp")
    except ImportError as e:
        logger.warning("MCP server not available", error=str(e))
    except Exception as e:
        logger.error("Failed to mount MCP server", error=str(e))

    # Mount Gradio UI
    # Mounted last, at the root path; also best-effort.
    try:
        from coderag.ui.app import create_gradio_app
        import gradio as gr

        gradio_app = create_gradio_app()
        app = gr.mount_gradio_app(app, gradio_app, path="/")
        logger.info("Gradio UI mounted at /")
    except ImportError as e:
        logger.warning("Gradio UI not available", error=str(e))
    except Exception as e:
        logger.error("Failed to mount Gradio UI", error=str(e))

    return app
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def main() -> None:
    """Run the application under uvicorn using the configured server settings."""
    app = create_app()

    logger.info(
        "Starting server",
        host=settings.server.host,
        port=settings.server.port,
    )

    # NOTE(review): uvicorn's `reload` and `workers` options only take effect
    # when an import string is passed instead of an app object — verify these
    # settings actually apply here.
    uvicorn.run(
        app,
        host=settings.server.host,
        port=settings.server.port,
        reload=settings.server.reload,
        workers=settings.server.workers,
        log_level=settings.server.log_level,
    )
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Application interrupted by user")
    except Exception as e:
        # Last-resort handler: log with traceback, print a visible banner,
        # and keep the terminal open so double-click launches don't vanish.
        logger.error("Application crashed", error=str(e), exc_info=True)
        import traceback
        print("\n" + "="*80)
        print("FATAL ERROR:")
        print("="*80)
        traceback.print_exc()
        print("="*80)
        input("Press Enter to close...")  # Keep terminal open
|
coderag/mcp/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MCP (Model Context Protocol) server for CodeRAG."""
|
| 2 |
+
|
| 3 |
+
from coderag.mcp.handlers import MCPHandlers, get_mcp_handlers
|
| 4 |
+
from coderag.mcp.server import create_mcp_server, mcp
|
| 5 |
+
|
| 6 |
+
__all__ = [
|
| 7 |
+
"MCPHandlers",
|
| 8 |
+
"get_mcp_handlers",
|
| 9 |
+
"create_mcp_server",
|
| 10 |
+
"mcp",
|
| 11 |
+
]
|
coderag/mcp/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (434 Bytes). View file
|
|
|
coderag/mcp/__pycache__/cli.cpython-313.pyc
ADDED
|
Binary file (1.23 kB). View file
|
|
|
coderag/mcp/__pycache__/handlers.cpython-313.pyc
ADDED
|
Binary file (23.9 kB). View file
|
|
|
coderag/mcp/__pycache__/prompts.cpython-313.pyc
ADDED
|
Binary file (4.72 kB). View file
|
|
|
coderag/mcp/__pycache__/resources.cpython-313.pyc
ADDED
|
Binary file (1.56 kB). View file
|
|
|
coderag/mcp/__pycache__/server.cpython-313.pyc
ADDED
|
Binary file (1.52 kB). View file
|
|
|
coderag/mcp/__pycache__/tools.cpython-313.pyc
ADDED
|
Binary file (5.03 kB). View file
|
|
|
coderag/mcp/cli.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI entry point for running MCP server in stdio mode."""
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
# Suppress all stdout output except MCP protocol
|
| 7 |
+
os.environ["PYTHONUNBUFFERED"] = "1"
|
| 8 |
+
|
| 9 |
+
# Redirect any stray prints to stderr
|
| 10 |
+
import io
|
| 11 |
+
_original_stdout = sys.stdout
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main():
    """Run the MCP server in stdio mode for Claude Desktop.

    The stdio transport uses stdout for the MCP protocol stream, so all
    logging is forced to stderr and structlog output is suppressed before
    the server package is imported.
    """
    # Suppress logging to stdout - redirect to stderr
    import logging
    logging.basicConfig(
        level=logging.WARNING,
        stream=sys.stderr,
        format="%(message)s"
    )

    # Suppress structlog output
    # Filtering at CRITICAL effectively silences structlog loggers.
    import structlog
    structlog.configure(
        wrapper_class=structlog.make_filtering_bound_logger(logging.CRITICAL),
    )

    # Imported only after logging is configured — presumably so import-time
    # log calls cannot pollute the protocol stream; confirm.
    from coderag.mcp.server import create_mcp_server

    mcp = create_mcp_server()
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()
|