Skip to content

Commit 3e9888f

Browse files
1 parent 536d578 commit 3e9888f

7 files changed

Lines changed: 293 additions & 37 deletions

File tree

src/codegraphcontext/cli/config_manager.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@
4747
"SCIP_INDEXER": "false",
4848
"SCIP_LANGUAGES": "python,typescript,go,rust,java",
4949
"SKIP_EXTERNAL_RESOLUTION": "false",
50+
# 0 = unlimited; any positive integer caps MCP tool response size.
51+
"MAX_TOOL_RESPONSE_TOKENS": "0",
52+
# JSON object mapping tool names to integer result-count limits.
53+
# Example: {"find_code": 20, "analyze_code_relationships": 10, "find_dead_code": 30}
54+
"TOOL_RESULT_LIMITS": "{}",
5055
}
5156

5257
# Configuration key descriptions
@@ -74,6 +79,8 @@
7479
"SCIP_INDEXER": "Use SCIP-based indexing for higher accuracy call/inheritance resolution (requires scip-<lang> tools installed)",
7580
"SCIP_LANGUAGES": "Comma-separated languages to index via SCIP when SCIP_INDEXER=true (python,typescript,go,rust,java)",
7681
"SKIP_EXTERNAL_RESOLUTION": "Skip resolution attempts for external library method calls (recommended for enterprise large Java/Spring codebases)",
82+
"MAX_TOOL_RESPONSE_TOKENS": "Maximum tokens per MCP tool response (0 = unlimited). Truncates oversized payloads and appends a notice.",
83+
"TOOL_RESULT_LIMITS": "JSON object mapping tool names to max result counts, e.g. {\"find_code\": 20, \"analyze_code_relationships\": 10}. Missing keys use built-in defaults.",
7784
}
7885

7986
# Valid values for each config key
@@ -342,6 +349,26 @@ def validate_config_value(key: str, value: str) -> tuple[bool, Optional[str]]:
342349
return False, "PARALLEL_WORKERS must be between 1 and 32"
343350
except ValueError:
344351
return False, "PARALLEL_WORKERS must be a number"
352+
353+
if key == "MAX_TOOL_RESPONSE_TOKENS":
354+
try:
355+
limit = int(value)
356+
if limit < 0:
357+
return False, "MAX_TOOL_RESPONSE_TOKENS must be 0 (unlimited) or a positive integer"
358+
except ValueError:
359+
return False, "MAX_TOOL_RESPONSE_TOKENS must be an integer (0 = unlimited)"
360+
361+
if key == "TOOL_RESULT_LIMITS":
362+
import json as _json
363+
try:
364+
parsed = _json.loads(value)
365+
if not isinstance(parsed, dict):
366+
return False, "TOOL_RESULT_LIMITS must be a JSON object, e.g. {\"find_code\": 20}"
367+
for k, v in parsed.items():
368+
if not isinstance(v, int) or v < 1:
369+
return False, f"TOOL_RESULT_LIMITS: value for '{k}' must be a positive integer"
370+
except _json.JSONDecodeError:
371+
return False, "TOOL_RESULT_LIMITS must be valid JSON, e.g. {\"find_code\": 20, \"find_dead_code\": 30}"
345372

346373
if key == "MAX_DEPTH":
347374
if value.lower() != "unlimited":

src/codegraphcontext/server.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,45 @@ def _strip_workspace_prefix(obj):
7575
return obj
7676

7777

78+
# Approximate chars-per-token used for budget conversion.
79+
# GPT-family tokenizers average roughly 4 chars/token on English prose. This is only a
# heuristic: JSON and source code may tokenize more densely, so the cap is approximate.
80+
_CHARS_PER_TOKEN = 4
81+
82+
83+
def _apply_response_token_limit(tool_name: str, text: str) -> str:
    """Cap *text* at the configured token budget, appending a truncation notice.

    ``MAX_TOOL_RESPONSE_TOKENS`` is looked up through ``get_config_value`` on
    every call. A falsy, non-numeric, zero or negative setting means "no
    limit" and *text* is returned untouched. Otherwise the token budget is
    converted to a character budget with ``_CHARS_PER_TOKEN``; text that
    already fits is returned as-is, and longer text is cut so that the kept
    prefix plus the notice fits the character budget. The notice itself is
    always emitted in full, even when it alone is longer than the budget.
    """
    from .cli.config_manager import get_config_value

    configured = get_config_value("MAX_TOOL_RESPONSE_TOKENS") or "0"
    try:
        max_tokens = int(configured)
    except ValueError:
        # Unparseable setting: behave as if no limit were configured.
        max_tokens = 0

    char_cap = max_tokens * _CHARS_PER_TOKEN
    if max_tokens <= 0 or len(text) <= char_cap:
        return text  # unlimited, or already within budget

    notice = (
        f"\n\n[CGC] Response truncated: output exceeded the MAX_TOOL_RESPONSE_TOKENS "
        f"limit of {max_tokens} tokens (tool: {tool_name}). "
        "Increase MAX_TOOL_RESPONSE_TOKENS or narrow your query for full results."
    )
    # The notice is charged against the budget; never slice with a negative length.
    keep = max(char_cap - len(notice), 0)
    return text[:keep] + notice
115+
116+
78117

79118
class MCPServer:
80119
"""
@@ -506,9 +545,11 @@ async def run(self):
506545
"error": {"code": -32000, "message": "Tool execution error", "data": result}
507546
}
508547
else:
548+
response_text = json.dumps(result, indent=2)
549+
response_text = _apply_response_token_limit(tool_name, response_text)
509550
response = {
510551
"jsonrpc": "2.0", "id": request_id,
511-
"result": {"content": [{"type": "text", "text": json.dumps(result, indent=2)}]}
552+
"result": {"content": [{"type": "text", "text": response_text}]}
512553
}
513554
elif method == 'notifications/initialized':
514555
# This is a notification, no response needed.

src/codegraphcontext/tools/code_finder.py

Lines changed: 57 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,20 @@ def _levenshtein_distance(a: str, b: str) -> int:
2424
prev = curr
2525
return prev[-1]
2626

27+
28+
def _normalize_identifier(s: str) -> str:
29+
"""Lowercase and strip separator chars so camelCase / snake_case / spaces
30+
all compare on equal footing.
31+
32+
Examples::
33+
34+
_normalize_identifier('myFunction') -> 'myfunction'
35+
_normalize_identifier('my_function') -> 'myfunction'
36+
_normalize_identifier('my function') -> 'myfunction'
37+
_normalize_identifier('MyFunc tion') -> 'myfunction'
38+
"""
39+
return s.lower().replace('_', '').replace(' ', '')
40+
2741
class CodeFinder:
2842
"""Module for finding relevant code snippets and analyzing relationships."""
2943

@@ -64,11 +78,18 @@ def _find_by_name_fuzzy_portable(
6478
edit_distance: int,
6579
repo_path: Optional[str],
6680
) -> List[Dict]:
67-
"""Fuzzy name match for backends without Lucene fuzzy syntax (Kùzu, FalkorDB, …)."""
81+
"""Fuzzy name match for backends without Lucene fuzzy syntax (Kùzu, FalkorDB, …).
82+
83+
Compares both the raw query and its identifier-normalised form against each
84+
candidate name, taking the minimum distance. This lets camelCase queries
85+
match snake_case stored names and vice-versa without inflating the distance.
86+
"""
6887
if not search_term.strip():
6988
return []
7089
where_clause = "WHERE node.path STARTS WITH $repo_path" if repo_path else ""
71-
limit_tail = "" if repo_path else " LIMIT 8000"
90+
# Without a repo filter we must cap the candidate scan. 20 000 is enough to
91+
# cover any realistic single-repo codebase while keeping latency acceptable.
92+
limit_tail = "" if repo_path else " LIMIT 20000"
7293
params: Dict[str, Any] = {}
7394
if repo_path:
7495
params["repo_path"] = repo_path
@@ -81,13 +102,27 @@ def _find_by_name_fuzzy_portable(
81102
"""
82103
with self.driver.session() as session:
83104
rows = session.run(query, **params).data()
84-
q = search_term.lower()
105+
106+
# Two query forms:
107+
# q_raw – lowercased original (e.g. "myFuncton" → "myfuncton")
108+
# q_norm – separator-stripped (e.g. "my_functon" → "myfuncton")
109+
# Using the minimum of both distances avoids the space-inflation bug where
110+
# a handler-side replace('_', ' ') would turn "my_functon" into "my functon",
111+
# which compares poorly against camelCase stored names.
112+
q_raw = search_term.lower()
113+
q_norm = _normalize_identifier(search_term)
114+
85115
scored: List[tuple[int, Dict]] = []
86116
for row in rows:
87117
nm = row.get("name")
88118
if not isinstance(nm, str):
89119
continue
90-
d = _levenshtein_distance(q, nm.lower())
120+
nm_lower = nm.lower()
121+
nm_norm = _normalize_identifier(nm)
122+
d = min(
123+
_levenshtein_distance(q_raw, nm_lower),
124+
_levenshtein_distance(q_norm, nm_norm),
125+
)
91126
if d <= edit_distance:
92127
scored.append((d, row))
93128
scored.sort(key=lambda x: x[0])
@@ -256,14 +291,24 @@ def find_imports(self, search_term: str) -> List[Dict]:
256291

257292
def find_related_code(self, user_query: str, fuzzy_search: bool, edit_distance: int, repo_path: Optional[str] = None) -> Dict[str, Any]:
258293
"""Find code related to a query using multiple search strategies"""
259-
# Neo4j full-text uses Lucene fuzzy tokens (e.g. name:foo~2). Kùzu/FalkorDB use
260-
# portable Levenshtein over candidate names instead.
261-
lucene_fuzzy_query = (
262-
" ".join(f"{t}~{edit_distance}" for t in user_query.split())
263-
if fuzzy_search and not self._lacks_native_fulltext
264-
else user_query
265-
)
266-
name_lookup_q = lucene_fuzzy_query if (fuzzy_search and not self._lacks_native_fulltext) else user_query
294+
# For Lucene backends: split snake_case/underscore tokens so Lucene sees
295+
# individual words, then append the fuzzy modifier.
296+
# For portable backends: keep user_query verbatim — _find_by_name_fuzzy_portable
297+
# handles normalisation via _normalize_identifier.
298+
if fuzzy_search and not self._lacks_native_fulltext:
299+
lucene_base = user_query.replace("_", " ").strip()
300+
lucene_fuzzy_query = " ".join(f"{t}~{edit_distance}" for t in lucene_base.split())
301+
else:
302+
lucene_fuzzy_query = user_query
303+
304+
# For portable backends, always pass the *original* query to the fuzzy name
305+
# matcher — _find_by_name_fuzzy_portable applies its own normalisation.
306+
# For Lucene-capable backends, use the Lucene fuzzy token form.
307+
if self._lacks_native_fulltext:
308+
name_lookup_q = user_query
309+
else:
310+
name_lookup_q = lucene_fuzzy_query if fuzzy_search else user_query
311+
267312
content_lookup_q = lucene_fuzzy_query if (fuzzy_search and not self._lacks_native_fulltext) else user_query
268313

269314
results: Dict[str, Any] = {

src/codegraphcontext/tools/handlers/analysis_handlers.py

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from typing import Any, Dict
22
from ..code_finder import CodeFinder
33
from ...utils.debug_log import debug_log
4+
from ...utils.tool_limits import get_tool_result_limit
5+
46

57
def find_dead_code(code_finder: CodeFinder, **args) -> Dict[str, Any]:
68
"""Tool to find potentially dead code across the entire project."""
@@ -9,16 +11,25 @@ def find_dead_code(code_finder: CodeFinder, **args) -> Dict[str, Any]:
911
try:
1012
debug_log(f"Finding dead code. repo_path={repo_path}")
1113
results = code_finder.find_dead_code(exclude_decorated_with=exclude_decorated_with, repo_path=repo_path)
12-
14+
15+
limit = get_tool_result_limit("find_dead_code")
16+
unused = results.get("potentially_unused_functions", [])
17+
truncated = False
18+
if limit and len(unused) > limit:
19+
unused = unused[:limit]
20+
truncated = True
21+
1322
return {
1423
"success": True,
1524
"query_type": "dead_code",
16-
"results": results
25+
"results": {**results, "potentially_unused_functions": unused},
26+
**({"result_limit": limit, "truncated": truncated} if truncated else {}),
1727
}
1828
except Exception as e:
1929
debug_log(f"Error finding dead code: {str(e)}")
2030
return {"error": f"Failed to find dead code: {str(e)}"}
2131

32+
2233
def calculate_cyclomatic_complexity(code_finder: CodeFinder, **args) -> Dict[str, Any]:
2334
"""Tool to calculate cyclomatic complexity for a given function."""
2435
function_name = args.get("function_name")
@@ -28,23 +39,24 @@ def calculate_cyclomatic_complexity(code_finder: CodeFinder, **args) -> Dict[str
2839
try:
2940
debug_log(f"Calculating cyclomatic complexity for function: {function_name}, repo_path={repo_path}")
3041
results = code_finder.get_cyclomatic_complexity(function_name, path, repo_path=repo_path)
31-
42+
3243
response = {
3344
"success": True,
3445
"function_name": function_name,
3546
"results": results
3647
}
3748
if path:
3849
response["path"] = path
39-
50+
4051
return response
4152
except Exception as e:
4253
debug_log(f"Error calculating cyclomatic complexity: {str(e)}")
4354
return {"error": f"Failed to calculate cyclomatic complexity: {str(e)}"}
4455

56+
4557
def find_most_complex_functions(code_finder: CodeFinder, **args) -> Dict[str, Any]:
4658
"""Tool to find the most complex functions."""
47-
limit = args.get("limit", 10)
59+
limit = get_tool_result_limit("find_most_complex_functions", default=args.get("limit", 10))
4860
repo_path = args.get("repo_path")
4961
try:
5062
debug_log(f"Finding the top {limit} most complex functions. repo_path={repo_path}")
@@ -58,6 +70,7 @@ def find_most_complex_functions(code_finder: CodeFinder, **args) -> Dict[str, An
5870
debug_log(f"Error finding most complex functions: {str(e)}")
5971
return {"error": f"Failed to find most complex functions: {str(e)}"}
6072

73+
6174
def analyze_code_relationships(code_finder: CodeFinder, **args) -> Dict[str, Any]:
6275
"""Tool to analyze code relationships"""
6376
query_type = args.get("query_type")
@@ -74,41 +87,65 @@ def analyze_code_relationships(code_finder: CodeFinder, **args) -> Dict[str, Any
7487
"module_deps", "variable_scope", "find_complexity", "find_functions_by_argument", "find_functions_by_decorator"
7588
]
7689
}
77-
90+
7891
try:
7992
debug_log(f"Analyzing relationships: {query_type} for {target}, repo_path={repo_path}")
8093
results = code_finder.analyze_code_relationships(query_type, target, context, repo_path=repo_path)
81-
82-
return {
94+
95+
# Apply per-query-type limit (falls back to tool-level limit)
96+
limit = get_tool_result_limit(query_type) or get_tool_result_limit("analyze_code_relationships")
97+
truncated = False
98+
if limit and isinstance(results, list) and len(results) > limit:
99+
results = results[:limit]
100+
truncated = True
101+
102+
response = {
83103
"success": True, "query_type": query_type, "target": target,
84-
"context": context, "results": results
104+
"context": context, "results": results,
85105
}
86-
106+
if truncated:
107+
response["result_limit"] = limit
108+
response["truncated"] = True
109+
return response
110+
87111
except Exception as e:
88112
debug_log(f"Error analyzing relationships: {str(e)}")
89113
return {"error": f"Failed to analyze relationships: {str(e)}"}
90114

115+
91116
def find_code(code_finder: CodeFinder, **args) -> Dict[str, Any]:
    """Tool to find relevant code snippets.

    Reads ``query``, ``fuzzy_search``, ``edit_distance`` and ``repo_path`` from
    *args*, delegates the search to ``code_finder.find_related_code`` and trims
    the ``ranked_results`` list to the limit returned by
    ``get_tool_result_limit("find_code")``. When trimming happens the response
    also carries ``result_limit`` and ``truncated``. Any exception is logged
    and reported as an ``{"error": ...}`` payload instead of being raised.
    """
    DEFAULT_EDIT_DISTANCE = 2
    DEFAULT_FUZZY_SEARCH = False

    query = args.get("query")
    repo_path = args.get("repo_path")
    fuzzy_search = args.get("fuzzy_search", DEFAULT_FUZZY_SEARCH)
    edit_distance = args.get("edit_distance", DEFAULT_EDIT_DISTANCE)

    # The query is forwarded verbatim even for fuzzy searches: underscore
    # splitting for Lucene backends and identifier normalisation for portable
    # (Kùzu/FalkorDB) backends are deferred to find_related_code /
    # _find_by_name_fuzzy_portable.
    try:
        debug_log(f"Finding code for query: {query} with fuzzy_search={fuzzy_search}, edit_distance={edit_distance}, repo_path={repo_path}")
        results = code_finder.find_related_code(query, fuzzy_search, edit_distance, repo_path=repo_path)

        limit = get_tool_result_limit("find_code")
        ranked = results.get("ranked_results", [])
        # A falsy limit means "no cap configured" for this tool.
        truncated = bool(limit) and len(ranked) > limit
        if truncated:
            ranked = ranked[:limit]

        response = {"success": True, "query": query, "results": {**results, "ranked_results": ranked}}
        if truncated:
            response["result_limit"] = limit
            response["truncated"] = True
        return response

    except Exception as e:
        debug_log(f"Error finding code: {str(e)}")
        return {"error": f"Failed to find code: {str(e)}"}

src/codegraphcontext/tools/handlers/management_handlers.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from datetime import datetime
44
from ...core.jobs import JobManager, JobStatus
55
from ...utils.debug_log import debug_log
6+
from ...utils.tool_limits import get_tool_result_limit
67
from ..code_finder import CodeFinder
78
from ..graph_builder import GraphBuilder
89

@@ -241,14 +242,24 @@ def search_registry_bundles(code_finder: CodeFinder, **args) -> Dict[str, Any]:
241242

242243
# Sort by name
243244
bundles.sort(key=lambda b: (b.get('name', ''), b.get('full_name', '')))
244-
245-
return {
245+
246+
limit = get_tool_result_limit("search_registry_bundles")
247+
truncated = False
248+
if limit and len(bundles) > limit:
249+
bundles = bundles[:limit]
250+
truncated = True
251+
252+
response = {
246253
"success": True,
247254
"bundles": bundles,
248255
"total": len(bundles),
249256
"query": query if query else "all",
250-
"unique_only": unique_only
257+
"unique_only": unique_only,
251258
}
259+
if truncated:
260+
response["result_limit"] = limit
261+
response["truncated"] = True
262+
return response
252263

253264
except Exception as e:
254265
debug_log(f"Error searching registry: {str(e)}")

0 commit comments

Comments
 (0)