fixes CodeGraphContext#758

Shashankss1205 · Shashankss1205 · commit 3e9888f4fe22 · 2026-05-01T15:14:13.000+05:30
diff --git a/src/codegraphcontext/cli/config_manager.py b/src/codegraphcontext/cli/config_manager.py
@@ -47,6 +47,11 @@
     "SCIP_INDEXER": "false",
     "SCIP_LANGUAGES": "python,typescript,go,rust,java",
     "SKIP_EXTERNAL_RESOLUTION": "false",
+    # 0 = unlimited; any positive integer caps MCP tool response size.
+    "MAX_TOOL_RESPONSE_TOKENS": "0",
+    # JSON object mapping tool names to integer result-count limits.
+    # Example: {"find_code": 20, "analyze_code_relationships": 10, "find_dead_code": 30}
+    "TOOL_RESULT_LIMITS": "{}",
 }
 
 # Configuration key descriptions
@@ -74,6 +79,8 @@
     "SCIP_INDEXER": "Use SCIP-based indexing for higher accuracy call/inheritance resolution (requires scip-<lang> tools installed)",
     "SCIP_LANGUAGES": "Comma-separated languages to index via SCIP when SCIP_INDEXER=true (python,typescript,go,rust,java)",
     "SKIP_EXTERNAL_RESOLUTION": "Skip resolution attempts for external library method calls (recommended for enterprise large Java/Spring codebases)",
+    "MAX_TOOL_RESPONSE_TOKENS": "Maximum tokens per MCP tool response (0 = unlimited). Truncates oversized payloads and appends a notice.",
+    "TOOL_RESULT_LIMITS": "JSON object mapping tool names to max result counts, e.g. {\"find_code\": 20, \"analyze_code_relationships\": 10}. Missing keys use built-in defaults.",
 }
 
 # Valid values for each config key
@@ -342,6 +349,26 @@ def validate_config_value(key: str, value: str) -> tuple[bool, Optional[str]]:
                 return False, "PARALLEL_WORKERS must be between 1 and 32"
         except ValueError:
             return False, "PARALLEL_WORKERS must be a number"
+
+    if key == "MAX_TOOL_RESPONSE_TOKENS":
+        try:
+            limit = int(value)
+            if limit < 0:
+                return False, "MAX_TOOL_RESPONSE_TOKENS must be 0 (unlimited) or a positive integer"
+        except ValueError:
+            return False, "MAX_TOOL_RESPONSE_TOKENS must be an integer (0 = unlimited)"
+
+    if key == "TOOL_RESULT_LIMITS":
+        import json as _json
+        try:
+            parsed = _json.loads(value)
+            if not isinstance(parsed, dict):
+                return False, "TOOL_RESULT_LIMITS must be a JSON object, e.g. {\"find_code\": 20}"
+            for k, v in parsed.items():
+                if not isinstance(v, int) or v < 1:
+                    return False, f"TOOL_RESULT_LIMITS: value for '{k}' must be a positive integer"
+        except _json.JSONDecodeError:
+            return False, "TOOL_RESULT_LIMITS must be valid JSON, e.g. {\"find_code\": 20, \"find_dead_code\": 30}"
     
     if key == "MAX_DEPTH":
         if value.lower() != "unlimited":
diff --git a/src/codegraphcontext/server.py b/src/codegraphcontext/server.py
@@ -75,6 +75,45 @@ def _strip_workspace_prefix(obj):
     return obj
 
 
+# Approximate chars-per-token used to convert the token budget into a character budget.
+# ~4 chars/token is a rule of thumb for GPT-family tokenizers; the real ratio varies with content, so the cap is approximate.
+_CHARS_PER_TOKEN = 4
+
+
+def _apply_response_token_limit(tool_name: str, text: str) -> str:
+    """Cap *text* at the configured response budget, appending a truncation notice.
+
+    ``MAX_TOOL_RESPONSE_TOKENS`` is read from the CGC config on every call and
+    converted to a character budget via ``_CHARS_PER_TOKEN``.  *text* is returned
+    unchanged when the limit is 0, negative, unset or not an integer; otherwise
+    the result (notice included) never exceeds the character budget.  Truncation
+    is a plain character cut, so a truncated JSON payload is no longer valid JSON.
+    """
+    from .cli.config_manager import get_config_value
+
+    raw = get_config_value("MAX_TOOL_RESPONSE_TOKENS") or "0"
+    try:
+        max_tokens = int(raw)
+    except (TypeError, ValueError):
+        max_tokens = 0  # malformed setting: fail open rather than drop output
+
+    max_chars = max_tokens * _CHARS_PER_TOKEN
+    if max_tokens <= 0 or len(text) <= max_chars:
+        return text  # unlimited, or already within budget
+
+    notice = (
+        f"\n\n[CGC] Response truncated: output exceeded the MAX_TOOL_RESPONSE_TOKENS "
+        f"limit of {max_tokens} tokens (tool: {tool_name}). "
+        "Increase MAX_TOOL_RESPONSE_TOKENS or narrow your query for full results."
+    )
+    # Reserve room for the notice inside the budget.  With a budget smaller than
+    # the notice itself nothing of *text* survives, and the final slice trims the
+    # notice too so the hard cap still holds.
+    kept = text[: max(0, max_chars - len(notice))]
+    return (kept + notice)[:max_chars]
+
+
 
 class MCPServer:
     """
@@ -506,9 +545,11 @@ async def run(self):
                             "error": {"code": -32000, "message": "Tool execution error", "data": result}
                         }
                     else:
+                        response_text = json.dumps(result, indent=2)
+                        response_text = _apply_response_token_limit(tool_name, response_text)
                         response = {
                             "jsonrpc": "2.0", "id": request_id,
-                            "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2)}]}
+                            "result": {"content": [{"type": "text", "text": response_text}]}
                         }
                 elif method == 'notifications/initialized':
                     # This is a notification, no response needed.
diff --git a/src/codegraphcontext/tools/code_finder.py b/src/codegraphcontext/tools/code_finder.py
@@ -24,6 +24,20 @@ def _levenshtein_distance(a: str, b: str) -> int:
         prev = curr
     return prev[-1]
 
+
+def _normalize_identifier(s: str) -> str:
+    """Return *s* lowercased with every underscore and space removed.
+
+    camelCase, snake_case and space-separated spellings of the same
+    identifier therefore collapse to one comparable string::
+
+        'myFunction'  -> 'myfunction'
+        'my_function' -> 'myfunction'
+        'my function' -> 'myfunction'
+    """
+    lowered = s.lower()
+    return "".join(ch for ch in lowered if ch not in "_ ")
+
 class CodeFinder:
     """Module for finding relevant code snippets and analyzing relationships."""
 
@@ -64,11 +78,18 @@ def _find_by_name_fuzzy_portable(
         edit_distance: int,
         repo_path: Optional[str],
     ) -> List[Dict]:
-        """Fuzzy name match for backends without Lucene fuzzy syntax (Kùzu, FalkorDB, …)."""
+        """Fuzzy name match for backends without Lucene fuzzy syntax (Kùzu, FalkorDB, …).
+
+        Compares both the raw query and its identifier-normalised form against each
+        candidate name, taking the minimum distance.  This lets camelCase queries
+        match snake_case stored names and vice-versa without inflating the distance.
+        """
         if not search_term.strip():
             return []
         where_clause = "WHERE node.path STARTS WITH $repo_path" if repo_path else ""
-        limit_tail = "" if repo_path else " LIMIT 8000"
+        # Without a repo filter we must cap the candidate scan.  20 000 is enough to
+        # cover any realistic single-repo codebase while keeping latency acceptable.
+        limit_tail = "" if repo_path else " LIMIT 20000"
         params: Dict[str, Any] = {}
         if repo_path:
             params["repo_path"] = repo_path
@@ -81,13 +102,27 @@ def _find_by_name_fuzzy_portable(
         """
         with self.driver.session() as session:
             rows = session.run(query, **params).data()
-        q = search_term.lower()
+
+        # Two query forms:
+        #   q_raw  – lowercased original (e.g. "myFuncton" → "myfuncton")
+        #   q_norm – separator-stripped  (e.g. "my_functon" → "myfuncton")
+        # Taking the minimum of both distances avoids the space-inflation bug where
+        # the find_code handler's former replace('_', ' ') turned "my_functon" into
+        # "my functon", which compared poorly against camelCase stored names.
+        q_raw = search_term.lower()
+        q_norm = _normalize_identifier(search_term)
+
         scored: List[tuple[int, Dict]] = []
         for row in rows:
             nm = row.get("name")
             if not isinstance(nm, str):
                 continue
-            d = _levenshtein_distance(q, nm.lower())
+            nm_lower = nm.lower()
+            nm_norm = _normalize_identifier(nm)
+            d = min(
+                _levenshtein_distance(q_raw, nm_lower),
+                _levenshtein_distance(q_norm, nm_norm),
+            )
             if d <= edit_distance:
                 scored.append((d, row))
         scored.sort(key=lambda x: x[0])
@@ -256,14 +291,24 @@ def find_imports(self, search_term: str) -> List[Dict]:
 
     def find_related_code(self, user_query: str, fuzzy_search: bool, edit_distance: int, repo_path: Optional[str] = None) -> Dict[str, Any]:
         """Find code related to a query using multiple search strategies"""
-        # Neo4j full-text uses Lucene fuzzy tokens (e.g. name:foo~2). Kùzu/FalkorDB use
-        # portable Levenshtein over candidate names instead.
-        lucene_fuzzy_query = (
-            " ".join(f"{t}~{edit_distance}" for t in user_query.split())
-            if fuzzy_search and not self._lacks_native_fulltext
-            else user_query
-        )
-        name_lookup_q = lucene_fuzzy_query if (fuzzy_search and not self._lacks_native_fulltext) else user_query
+        # For Lucene backends: split snake_case/underscore tokens so Lucene sees
+        # individual words, then append the fuzzy modifier.
+        # For portable backends: keep user_query verbatim — _find_by_name_fuzzy_portable
+        # handles normalisation via _normalize_identifier.
+        if fuzzy_search and not self._lacks_native_fulltext:
+            lucene_base = user_query.replace("_", " ").strip()
+            lucene_fuzzy_query = " ".join(f"{t}~{edit_distance}" for t in lucene_base.split())
+        else:
+            lucene_fuzzy_query = user_query
+
+        # For portable backends, always pass the *original* query to the fuzzy name
+        # matcher — _find_by_name_fuzzy_portable applies its own normalisation.
+        # For Lucene-capable backends, use the Lucene fuzzy token form.
+        if self._lacks_native_fulltext:
+            name_lookup_q = user_query
+        else:
+            name_lookup_q = lucene_fuzzy_query if fuzzy_search else user_query
+
         content_lookup_q = lucene_fuzzy_query if (fuzzy_search and not self._lacks_native_fulltext) else user_query
 
         results: Dict[str, Any] = {
diff --git a/src/codegraphcontext/tools/handlers/analysis_handlers.py b/src/codegraphcontext/tools/handlers/analysis_handlers.py
@@ -1,6 +1,8 @@
 from typing import Any, Dict
 from ..code_finder import CodeFinder
 from ...utils.debug_log import debug_log
+from ...utils.tool_limits import get_tool_result_limit
+
 
 def find_dead_code(code_finder: CodeFinder, **args) -> Dict[str, Any]:
     """Tool to find potentially dead code across the entire project."""
@@ -9,16 +11,25 @@ def find_dead_code(code_finder: CodeFinder, **args) -> Dict[str, Any]:
     try:
         debug_log(f"Finding dead code. repo_path={repo_path}")
         results = code_finder.find_dead_code(exclude_decorated_with=exclude_decorated_with, repo_path=repo_path)
-        
+
+        limit = get_tool_result_limit("find_dead_code")
+        unused = results.get("potentially_unused_functions", [])
+        truncated = False
+        if limit and len(unused) > limit:
+            unused = unused[:limit]
+            truncated = True
+
         return {
             "success": True,
             "query_type": "dead_code",
-            "results": results
+            "results": {**results, "potentially_unused_functions": unused},
+            **({"result_limit": limit, "truncated": truncated} if truncated else {}),
         }
     except Exception as e:
         debug_log(f"Error finding dead code: {str(e)}")
         return {"error": f"Failed to find dead code: {str(e)}"}
 
+
 def calculate_cyclomatic_complexity(code_finder: CodeFinder, **args) -> Dict[str, Any]:
     """Tool to calculate cyclomatic complexity for a given function."""
     function_name = args.get("function_name")
@@ -28,23 +39,24 @@ def calculate_cyclomatic_complexity(code_finder: CodeFinder, **args) -> Dict[str
     try:
         debug_log(f"Calculating cyclomatic complexity for function: {function_name}, repo_path={repo_path}")
         results = code_finder.get_cyclomatic_complexity(function_name, path, repo_path=repo_path)
-        
+
         response = {
             "success": True,
             "function_name": function_name,
             "results": results
         }
         if path:
             response["path"] = path
-        
+
         return response
     except Exception as e:
         debug_log(f"Error calculating cyclomatic complexity: {str(e)}")
         return {"error": f"Failed to calculate cyclomatic complexity: {str(e)}"}
 
+
 def find_most_complex_functions(code_finder: CodeFinder, **args) -> Dict[str, Any]:
     """Tool to find the most complex functions."""
-    limit = args.get("limit", 10)
+    limit = get_tool_result_limit("find_most_complex_functions", default=args.get("limit", 10))
     repo_path = args.get("repo_path")
     try:
         debug_log(f"Finding the top {limit} most complex functions. repo_path={repo_path}")
@@ -58,6 +70,7 @@ def find_most_complex_functions(code_finder: CodeFinder, **args) -> Dict[str, An
         debug_log(f"Error finding most complex functions: {str(e)}")
         return {"error": f"Failed to find most complex functions: {str(e)}"}
 
+
 def analyze_code_relationships(code_finder: CodeFinder, **args) -> Dict[str, Any]:
     """Tool to analyze code relationships"""
     query_type = args.get("query_type")
@@ -74,41 +87,65 @@ def analyze_code_relationships(code_finder: CodeFinder, **args) -> Dict[str, Any
                 "module_deps", "variable_scope", "find_complexity", "find_functions_by_argument", "find_functions_by_decorator"
             ]
         }
-    
+
     try:
         debug_log(f"Analyzing relationships: {query_type} for {target}, repo_path={repo_path}")
         results = code_finder.analyze_code_relationships(query_type, target, context, repo_path=repo_path)
-        
-        return {
+
+        # Apply per-query-type limit (falls back to tool-level limit)
+        limit = get_tool_result_limit(query_type) or get_tool_result_limit("analyze_code_relationships")
+        truncated = False
+        if limit and isinstance(results, list) and len(results) > limit:
+            results = results[:limit]
+            truncated = True
+
+        response = {
             "success": True, "query_type": query_type, "target": target,
-            "context": context, "results": results
+            "context": context, "results": results,
         }
-    
+        if truncated:
+            response["result_limit"] = limit
+            response["truncated"] = True
+        return response
+
     except Exception as e:
         debug_log(f"Error analyzing relationships: {str(e)}")
         return {"error": f"Failed to analyze relationships: {str(e)}"}
 
+
 def find_code(code_finder: CodeFinder, **args) -> Dict[str, Any]:
     """Tool to find relevant code snippets"""
     query = args.get("query")
     DEFAULT_EDIT_DISTANCE = 2
     DEFAULT_FUZZY_SEARCH = False
-    
+
     fuzzy_search = args.get("fuzzy_search", DEFAULT_FUZZY_SEARCH)
     edit_distance = args.get("edit_distance", DEFAULT_EDIT_DISTANCE)
     repo_path = args.get("repo_path")
 
     if fuzzy_search:
-        # Preserve case for Lucene / Levenshtein name matching; lowercasing breaks
-        # camelCase fuzzy hits (see GH #758).
-        query = query.replace("_", " ").strip()
-        
+        # Intentionally a no-op: the query is forwarded verbatim.  find_related_code
+        # does the '_' -> ' ' split for Lucene backends, and
+        # _find_by_name_fuzzy_portable normalises separators for portable ones.
+        pass  # transformation deferred to find_related_code / _find_by_name_fuzzy_portable
+
     try:
         debug_log(f"Finding code for query: {query} with fuzzy_search={fuzzy_search}, edit_distance={edit_distance}, repo_path={repo_path}")
         results = code_finder.find_related_code(query, fuzzy_search, edit_distance, repo_path=repo_path)
 
-        return {"success": True, "query": query, "results": results}
-    
+        limit = get_tool_result_limit("find_code")
+        ranked = results.get("ranked_results", [])
+        truncated = False
+        if limit and len(ranked) > limit:
+            ranked = ranked[:limit]
+            truncated = True
+
+        response = {"success": True, "query": query, "results": {**results, "ranked_results": ranked}}
+        if truncated:
+            response["result_limit"] = limit
+            response["truncated"] = True
+        return response
+
     except Exception as e:
         debug_log(f"Error finding code: {str(e)}")
         return {"error": f"Failed to find code: {str(e)}"}
diff --git a/src/codegraphcontext/tools/handlers/management_handlers.py b/src/codegraphcontext/tools/handlers/management_handlers.py
@@ -3,6 +3,7 @@
 from datetime import datetime
 from ...core.jobs import JobManager, JobStatus
 from ...utils.debug_log import debug_log
+from ...utils.tool_limits import get_tool_result_limit
 from ..code_finder import CodeFinder
 from ..graph_builder import GraphBuilder
 
@@ -241,14 +242,24 @@ def search_registry_bundles(code_finder: CodeFinder, **args) -> Dict[str, Any]:
         
         # Sort by name
         bundles.sort(key=lambda b: (b.get('name', ''), b.get('full_name', '')))
-        
-        return {
+
+        limit = get_tool_result_limit("search_registry_bundles")
+        truncated = False
+        if limit and len(bundles) > limit:
+            bundles = bundles[:limit]
+            truncated = True
+
+        response = {
             "success": True,
             "bundles": bundles,
             "total": len(bundles),
             "query": query if query else "all",
-            "unique_only": unique_only
+            "unique_only": unique_only,
         }
+        if truncated:
+            response["result_limit"] = limit
+            response["truncated"] = True
+        return response
     
     except Exception as e:
         debug_log(f"Error searching registry: {str(e)}")
diff --git a/src/codegraphcontext/tools/handlers/query_handlers.py b/src/codegraphcontext/tools/handlers/query_handlers.py
diff --git a/src/codegraphcontext/utils/tool_limits.py b/src/codegraphcontext/utils/tool_limits.py