From e9387e4360fba9cb06888897e526ffa12e4d645d Mon Sep 17 00:00:00 2001 From: Pascal Pothmann Date: Fri, 27 Feb 2026 11:02:03 +0100 Subject: [PATCH 1/9] Add Kotlin support to CodeWiki documentation and analysis tools --- README.md | 4 +- codewiki/cli/utils/repo_validator.py | 2 + codewiki/cli/utils/validation.py | 1 + .../analysis/analysis_service.py | 3 +- .../analysis/call_graph_analyzer.py | 25 + .../dependency_analyzer/analyzers/kotlin.py | 505 ++++++++++++++++++ .../src/be/dependency_analyzer/ast_parser.py | 2 +- .../be/dependency_analyzer/utils/patterns.py | 6 +- codewiki/src/be/prompt_template.py | 2 + pyproject.toml | 1 + requirements.txt | 1 + 11 files changed, 546 insertions(+), 6 deletions(-) create mode 100644 codewiki/src/be/dependency_analyzer/analyzers/kotlin.py diff --git a/README.md b/README.md index ce047407..951812bb 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ codewiki generate --github-pages --create-branch ## What is CodeWiki? -CodeWiki is an open-source framework for **automated repository-level documentation** across seven programming languages. It generates holistic, architecture-aware documentation that captures not only individual functions but also their cross-file, cross-module, and system-level interactions. +CodeWiki is an open-source framework for **automated repository-level documentation** across eight programming languages. It generates holistic, architecture-aware documentation that captures not only individual functions but also their cross-file, cross-module, and system-level interactions. ### Key Innovations @@ -88,7 +88,7 @@ CodeWiki is an open-source framework for **automated repository-level documentat ### Supported Languages -**🐍 Python** • **☕ Java** • **🟨 JavaScript** • **🔷 TypeScript** • **⚙️ C** • **🔧 C++** • **🪟 C#** +**🐍 Python** • **☕ Java** • **🟨 JavaScript** • **🔷 TypeScript** • **⚙️ C** • **🔧 C++** • **🪟 C#** • **🎯 Kotlin** --- diff --git a/codewiki/cli/utils/repo_validator.py b/codewiki/cli/utils/repo_validator.py index 3e17d031..608e6037 100644 --- a/codewiki/cli/utils/repo_validator.py +++ b/codewiki/cli/utils/repo_validator.py @@ -30,6 +30,8 @@ '.php', # PHP '.phtml', # PHP templates '.inc', # PHP includes + '.kt', # Kotlin + '.kts', # Kotlin Scripts } diff --git a/codewiki/cli/utils/validation.py b/codewiki/cli/utils/validation.py index 12cb5454..9711ba33 100644 --- a/codewiki/cli/utils/validation.py +++ b/codewiki/cli/utils/validation.py @@ -172,6 +172,7 @@ def detect_supported_languages(directory: Path) -> List[Tuple[str, int]]: 'C++': ['.cpp', '.hpp', '.cc', '.hh', '.cxx', '.hxx'], 'C#': ['.cs'], 'PHP': ['.php', '.phtml', '.inc'], + 'Kotlin': ['.kt', '.kts'], } # Directories to exclude from counting diff --git a/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py b/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py index aa3ba471..c9cf5bb6 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py +++ b/codewiki/src/be/dependency_analyzer/analysis/analysis_service.py @@ -310,6 +310,7 @@ def _filter_supported_languages(self, code_files: List[Dict]) -> List[Dict]: "php", "go", "rust", + "kotlin", } return [ @@ -320,7 +321,7 @@ def _filter_supported_languages(self, code_files: List[Dict]) -> List[Dict]: def _get_supported_languages(self) -> List[str]: """Get list of currently supported languages for analysis.""" - return ["python", "javascript", "typescript", "java", "csharp", "c", "cpp", "php"] + return ["python", "javascript", "typescript", "java", "csharp", "c", "cpp", 
"php", "kotlin"] def _cleanup_repository(self, temp_dir: str): """Clean up cloned repository.""" diff --git a/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py b/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py index 7175cd9b..da825fd4 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py +++ b/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py @@ -126,6 +126,8 @@ def _analyze_code_file(self, repo_dir: str, file_info: Dict): self._analyze_typescript_file(file_path, content, repo_dir) elif language == "java": self._analyze_java_file(file_path, content, repo_dir) + elif language == "kotlin": + self._analyze_kotlin_file(file_path, content, repo_dir) elif language == "csharp": self._analyze_csharp_file(file_path, content, repo_dir) elif language == "c": @@ -280,6 +282,27 @@ def _analyze_java_file(self, file_path: str, content: str, repo_dir: str): except Exception as e: logger.error(f"Failed to analyze Java file {file_path}: {e}", exc_info=True) + def _analyze_kotlin_file(self, file_path: str, content: str, repo_dir: str): + """ + Analyze Kotlin file using tree-sitter based analyzer. + + Args: + file_path: Relative path to the Kotlin file + content: File content string + repo_dir: Repository base directory + """ + from codewiki.src.be.dependency_analyzer.analyzers.kotlin import analyze_kotlin_file + + try: + functions, relationships = analyze_kotlin_file(file_path, content, repo_path=repo_dir) + for func in functions: + func_id = func.id if func.id else f"{file_path}:{func.name}" + self.functions[func_id] = func + + self.call_relationships.extend(relationships) + except Exception as e: + logger.error(f"Failed to analyze Kotlin file {file_path}: {e}", exc_info=True) + def _analyze_csharp_file(self, file_path: str, content: str, repo_dir: str): """ Analyze C# file using tree-sitter based analyzer. 
@@ -408,6 +431,8 @@ def _generate_visualization_data(self) -> Dict: node_classes.append("lang-c") elif file_ext in [".cpp", ".cc", ".cxx", ".hpp", ".hxx"]: node_classes.append("lang-cpp") + elif file_ext in [".kt", ".kts"]: + node_classes.append("lang-kotlin") elif file_ext in [".php", ".phtml", ".inc"]: node_classes.append("lang-php") diff --git a/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py b/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py new file mode 100644 index 00000000..d56f220c --- /dev/null +++ b/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py @@ -0,0 +1,505 @@ +import logging +from typing import List, Optional, Tuple +from pathlib import Path +import sys +import os + +from tree_sitter import Parser, Language +import tree_sitter_kotlin +from codewiki.src.be.dependency_analyzer.models.core import Node, CallRelationship + +logger = logging.getLogger(__name__) + +class TreeSitterKotlinAnalyzer: + def __init__(self, file_path: str, content: str, repo_path: Optional[str] = None): + self.file_path = Path(file_path) + self.content = content + self.repo_path = repo_path or "" + self.nodes: List[Node] = [] + self.call_relationships: List[CallRelationship] = [] + self._analyze() + + def _get_module_path(self) -> str: + if self.repo_path: + try: + rel_path = os.path.relpath(str(self.file_path), self.repo_path) + except ValueError: + rel_path = str(self.file_path) + else: + rel_path = str(self.file_path) + + for ext in ['.kt', '.kts']: + if rel_path.endswith(ext): + rel_path = rel_path[:-len(ext)] + break + return rel_path.replace('/', '.').replace('\\', '.') + + def _get_relative_path(self) -> str: + """Get relative path from repo root.""" + if self.repo_path: + try: + return os.path.relpath(str(self.file_path), self.repo_path) + except ValueError: + return str(self.file_path) + else: + return str(self.file_path) + + def _get_component_id(self, name: str, parent_class: Optional[str] = None) -> str: + module_path = self._get_module_path() + if parent_class: + return f"{module_path}.{parent_class}.{name}" + else: + return f"{module_path}.{name}" + + def _analyze(self): + try: + language_capsule = tree_sitter_kotlin.language() + kotlin_language = Language(language_capsule) + parser = Parser(kotlin_language) + tree = parser.parse(bytes(self.content, "utf8")) + root = tree.root_node + lines = self.content.splitlines() + + top_level_nodes = {} + + self._extract_nodes(root, top_level_nodes, lines) + self._extract_relationships(root, top_level_nodes) + except Exception as e: + logger.error(f"Error parsing Kotlin file {self.file_path}: {e}") + + def _extract_nodes(self, node, top_level_nodes, lines): + node_type = None + node_name = None + + if node.type == "class_declaration": + is_interface = any(c.type == "interface" for c in node.children) + + if is_interface: + node_type = "interface" + else: + modifiers = self._get_class_modifiers(node) + if "abstract" in modifiers: + node_type = "abstract class" + elif "data" in modifiers: + node_type = "data class" + elif "enum" in modifiers: + node_type = "enum class" + elif "annotation" in modifiers: + node_type = "annotation class" + else: + node_type = "class" + + name_node = next((c for c in node.children if c.type == "identifier"), None) + node_name = name_node.text.decode() if name_node else None + + elif node.type == "object_declaration": + node_type = "object" + name_node = next((c for c in node.children if c.type == "identifier"), None) + node_name = name_node.text.decode() if name_node else None + + elif node.type == 
"function_declaration": + name_node = next((c for c in node.children if c.type == "identifier"), None) + if name_node: + method_name = name_node.text.decode() + containing_class = self._find_containing_class_name(node) + if containing_class: + node_type = "method" + node_name = f"{containing_class}.{method_name}" + else: + node_type = "function" + node_name = method_name + + if node_type and node_name: + component_id = self._get_component_id(node_name) + relative_path = self._get_relative_path() + + # Extract docstring if present + docstring = "" + if node.prev_sibling and hasattr(node.prev_sibling, "type"): + if node.prev_sibling.type in ("line_comment", "block_comment"): + docstring = node.prev_sibling.text.decode().strip() + + # Safely extract code lines + start_line_idx = node.start_point[0] + end_line_idx = node.end_point[0] + 1 + code_snippet = "\n".join(lines[start_line_idx:end_line_idx]) if start_line_idx < len(lines) else "" + + node_obj = Node( + id=component_id, + name=node_name, + component_type=node_type, + file_path=str(self.file_path), + relative_path=relative_path, + source_code=code_snippet, + start_line=node.start_point[0]+1, + end_line=node.end_point[0]+1, + has_docstring=bool(docstring), + docstring=docstring, + parameters=None, + node_type=node_type, + base_classes=None, + class_name=None, + display_name=f"{node_type} {node_name}", + component_id=component_id + ) + self.nodes.append(node_obj) + top_level_nodes[node_name] = node_obj + + for child in node.children: + self._extract_nodes(child, top_level_nodes, lines) + + def _get_class_modifiers(self, class_node) -> set: + """Extract class modifiers (abstract, data, enum, annotation, etc.).""" + modifiers = set() + modifiers_node = next((c for c in class_node.children if c.type == "modifiers"), None) + if modifiers_node: + for mod in modifiers_node.children: + if mod.type in ("class_modifier", "inheritance_modifier", "visibility_modifier"): + for inner in mod.children: + modifiers.add(inner.type) + return modifiers + + def _extract_relationships(self, node, top_level_nodes): + # 1. Inheritance and Interface Implementation via delegation_specifiers + if node.type == "class_declaration": + class_name = self._get_identifier_name(node) + delegation_specifiers = next( + (c for c in node.children if c.type == "delegation_specifiers"), None + ) + if delegation_specifiers and class_name: + for spec in delegation_specifiers.children: + if spec.type == "delegation_specifier": + for child in spec.children: + type_name = None + if child.type == "constructor_invocation": + user_type = next( + (c for c in child.children if c.type == "user_type"), None + ) + if user_type: + type_name = self._get_type_name(user_type) + elif child.type == "user_type": + type_name = self._get_type_name(child) + + if type_name and not self._is_primitive_type(type_name): + caller_id = self._get_component_id(class_name) + callee_id = self._get_component_id(type_name) + self.call_relationships.append(CallRelationship( + caller=caller_id, + callee=callee_id, + call_line=node.start_point[0]+1, + is_resolved=False + )) + + # 2. 
Property Type Use (field types) + if node.type == "property_declaration": + containing_class = self._find_containing_class(node, top_level_nodes) + var_decl = next((c for c in node.children if c.type == "variable_declaration"), None) + if containing_class and var_decl: + type_node = next( + (c for c in var_decl.children if c.type == "user_type"), None + ) + if type_node: + prop_type_name = self._get_type_name(type_node) + if prop_type_name and not self._is_primitive_type(prop_type_name): + self.call_relationships.append(CallRelationship( + caller=containing_class, + callee=prop_type_name, + call_line=node.start_point[0]+1, + is_resolved=False + )) + + # 3. Constructor parameter type use + if node.type == "class_parameter": + containing_class_node = node.parent + while containing_class_node and containing_class_node.type != "class_declaration": + containing_class_node = containing_class_node.parent + if containing_class_node: + class_name = self._get_identifier_name(containing_class_node) + if class_name and class_name in top_level_nodes: + type_node = next( + (c for c in node.children if c.type == "user_type"), None + ) + if type_node: + param_type = self._get_type_name(type_node) + if param_type and not self._is_primitive_type(param_type): + caller_id = self._get_component_id(class_name) + self.call_relationships.append(CallRelationship( + caller=caller_id, + callee=param_type, + call_line=node.start_point[0]+1, + is_resolved=False + )) + + # 4. Method Calls / Function invocations + if node.type == "call_expression": + caller_id = self._find_containing_method(node) or self._find_containing_class(node, top_level_nodes) + + target_expr = next( + (c for c in node.children if c.type in ["identifier", "navigation_expression"]), None + ) + + if target_expr and caller_id: + if target_expr.type == "identifier": + callee_name = target_expr.text.decode() + if callee_name and callee_name[0].isupper() and not self._is_primitive_type(callee_name): + callee_id = self._get_component_id(callee_name) + self.call_relationships.append(CallRelationship( + caller=caller_id, + callee=callee_id, + call_line=node.start_point[0]+1, + is_resolved=False + )) + elif callee_name and not self._is_primitive_type(callee_name): + self.call_relationships.append(CallRelationship( + caller=caller_id, + callee=callee_name, + call_line=node.start_point[0]+1, + is_resolved=False + )) + + elif target_expr.type == "navigation_expression": + children = list(target_expr.children) + object_node = next( + (c for c in children if c.type == "identifier"), None + ) + method_node = None + identifiers = [c for c in children if c.type == "identifier"] + if len(identifiers) >= 2: + object_node = identifiers[0] + method_node = identifiers[-1] + elif len(identifiers) == 1: + method_node = identifiers[0] + nav_child = next( + (c for c in children if c.type == "navigation_expression"), None + ) + if nav_child: + object_node = self._get_root_identifier(nav_child) + else: + object_node = None + + if object_node and method_node: + object_name = object_node.text.decode() if hasattr(object_node, 'text') else str(object_node) + method_name = method_node.text.decode() + + target_type = None + if object_name in top_level_nodes: + target_type = object_name + else: + target_type = self._find_variable_type(node, object_name, top_level_nodes) + + if target_type and not self._is_primitive_type(target_type): + callee_id = self._get_component_id(target_type) + self.call_relationships.append(CallRelationship( + caller=caller_id, + callee=callee_id, + 
call_line=node.start_point[0]+1, + is_resolved=False + )) + elif method_node and not object_node: + callee_name = method_node.text.decode() + self.call_relationships.append(CallRelationship( + caller=caller_id, + callee=callee_name, + call_line=node.start_point[0]+1, + is_resolved=False + )) + + for child in node.children: + self._extract_relationships(child, top_level_nodes) + + def _is_primitive_type(self, type_name: str) -> bool: + """Check if type is a Kotlin primitive or common built-in type.""" + primitives = { + "Boolean", "Byte", "Char", "Double", "Float", "Int", "Long", "Short", + "String", "Unit", "Nothing", "Any", + "List", "Set", "Map", "Collection", "Iterable", "Sequence", + "MutableList", "MutableSet", "MutableMap", "MutableCollection", + "Array", "IntArray", "LongArray", "FloatArray", "DoubleArray", + "BooleanArray", "ByteArray", "CharArray", "ShortArray", + "Pair", "Triple", + } + return type_name in primitives + + def _get_identifier_name(self, node): + """Get identifier name from a node.""" + name_node = next((c for c in node.children if c.type == "identifier"), None) + return name_node.text.decode() if name_node else None + + def _get_type_name(self, node) -> Optional[str]: + """Get the primary type name from a type node, stripping generics.""" + if node.type == "user_type": + id_node = next((c for c in node.children if c.type == "identifier"), None) + return id_node.text.decode() if id_node else None + elif node.type == "nullable_type": + inner = next((c for c in node.children if c.type == "user_type"), None) + if inner: + return self._get_type_name(inner) + elif node.type == "identifier": + return node.text.decode() + return None + + def _get_root_identifier(self, nav_node): + """Get the root identifier from a chain of navigation_expressions.""" + first_child = nav_node.children[0] if nav_node.children else None + if first_child: + if first_child.type == "identifier": + return first_child + elif first_child.type == "navigation_expression": + return self._get_root_identifier(first_child) + return None + + def _find_containing_class_name(self, node): + """Walk up to find the containing class/object/interface name.""" + current = node.parent + while current: + if current.type in ("class_declaration", "object_declaration"): + name_node = next((c for c in current.children if c.type == "identifier"), None) + if name_node: + return name_node.text.decode() + current = current.parent + return None + + def _find_containing_class(self, node, top_level_nodes): + """Find the component ID of the containing class.""" + class_name = self._find_containing_class_name(node) + if class_name and class_name in top_level_nodes: + return self._get_component_id(class_name) + return None + + def _find_containing_method(self, node): + """Find the component ID of the containing function/method.""" + current = node.parent + while current: + if current.type == "function_declaration": + method_name = self._get_identifier_name(current) + class_name = self._find_containing_class_name(current) + if method_name: + if class_name: + return self._get_component_id(f"{class_name}.{method_name}") + return self._get_component_id(method_name) + current = current.parent + return None + + def _find_variable_type(self, node, variable_name: str, top_level_nodes) -> Optional[str]: + """ + Try to resolve the type of a variable by searching local declarations, + function parameters, constructor parameters, and class properties. 
+ """ + func_node = node.parent + while func_node and func_node.type != "function_declaration": + func_node = func_node.parent + + if func_node: + params_node = next( + (c for c in func_node.children if c.type == "function_value_parameters"), None + ) + if params_node: + for param in params_node.children: + if param.type == "parameter": + param_name_node = next( + (c for c in param.children if c.type == "identifier"), None + ) + if param_name_node and param_name_node.text.decode() == variable_name: + type_node = next( + (c for c in param.children if c.type in ("user_type", "nullable_type")), None + ) + if type_node: + return self._get_type_name(type_node) + + body_node = next( + (c for c in func_node.children if c.type == "function_body"), None + ) + if body_node: + block = next((c for c in body_node.children if c.type == "block"), None) + if block: + result = self._search_variable_declaration(block, variable_name) + if result: + return result + + class_node = node.parent + while class_node and class_node.type not in ("class_declaration", "object_declaration"): + class_node = class_node.parent + + if class_node: + primary_ctor = next( + (c for c in class_node.children if c.type == "primary_constructor"), None + ) + if primary_ctor: + class_params = next( + (c for c in primary_ctor.children if c.type == "class_parameters"), None + ) + if class_params: + for param in class_params.children: + if param.type == "class_parameter": + param_name = next( + (c for c in param.children if c.type == "identifier"), None + ) + if param_name and param_name.text.decode() == variable_name: + type_node = next( + (c for c in param.children if c.type in ("user_type", "nullable_type")), None + ) + if type_node: + return self._get_type_name(type_node) + + class_body = next( + (c for c in class_node.children if c.type in ("class_body", "enum_class_body")), None + ) + if class_body: + for body_child in class_body.children: + if body_child.type == "property_declaration": + var_decl = next( + (c for c in body_child.children if c.type == "variable_declaration"), None + ) + if var_decl: + prop_name = next( + (c for c in var_decl.children if c.type == "identifier"), None + ) + if prop_name and prop_name.text.decode() == variable_name: + type_node = next( + (c for c in var_decl.children if c.type in ("user_type", "nullable_type")), None + ) + if type_node: + return self._get_type_name(type_node) + + return None + + def _search_variable_declaration(self, block_node, variable_name: str) -> Optional[str]: + """Search for a local variable declaration with explicit type annotation in a block.""" + for child in block_node.children: + if child.type == "property_declaration": + var_decl = next( + (c for c in child.children if c.type == "variable_declaration"), None + ) + if var_decl: + name_node = next( + (c for c in var_decl.children if c.type == "identifier"), None + ) + if name_node and name_node.text.decode() == variable_name: + type_node = next( + (c for c in var_decl.children if c.type in ("user_type", "nullable_type")), None + ) + if type_node: + return self._get_type_name(type_node) + + init_expr = next( + (c for c in child.children if c.type == "call_expression"), None + ) + if init_expr: + call_id = next( + (c for c in init_expr.children if c.type == "identifier"), None + ) + if call_id: + inferred = call_id.text.decode() + if inferred and inferred[0].isupper(): + return inferred + + elif child.type == "block": + result = self._search_variable_declaration(child, variable_name) + if result: + return result + + return 
None + +def analyze_kotlin_file(file_path: str, content: str, repo_path: Optional[str] = None) -> Tuple[List[Node], List[CallRelationship]]: + analyzer = TreeSitterKotlinAnalyzer(file_path, content, repo_path) + return analyzer.nodes, analyzer.call_relationships diff --git a/codewiki/src/be/dependency_analyzer/ast_parser.py b/codewiki/src/be/dependency_analyzer/ast_parser.py index 3323ed7a..81ac0bdc 100644 --- a/codewiki/src/be/dependency_analyzer/ast_parser.py +++ b/codewiki/src/be/dependency_analyzer/ast_parser.py @@ -135,7 +135,7 @@ def _determine_component_type(self, func_dict: Dict) -> str: def _file_to_module_path(self, file_path: str) -> str: path = file_path - extensions = ['.py', '.js', '.ts', '.java', '.cs', '.cpp', '.hpp', '.h', '.c', '.tsx', '.jsx', '.cc', '.mjs', '.cxx', '.cc', '.cjs'] + extensions = ['.py', '.js', '.ts', '.java', '.cs', '.cpp', '.hpp', '.h', '.c', '.tsx', '.jsx', '.cc', '.mjs', '.cxx', '.cc', '.cjs', '.kt', '.kts'] for ext in extensions: if path.endswith(ext): path = path[:-len(ext)] diff --git a/codewiki/src/be/dependency_analyzer/utils/patterns.py b/codewiki/src/be/dependency_analyzer/utils/patterns.py index 9fb003f7..1680ed4f 100644 --- a/codewiki/src/be/dependency_analyzer/utils/patterns.py +++ b/codewiki/src/be/dependency_analyzer/utils/patterns.py @@ -5,7 +5,7 @@ and function definitions across multiple programming languages. """ -from typing import List, Dict +from typing import List, Dict, Optional DEFAULT_IGNORE_PATTERNS = { ".github", @@ -156,6 +156,7 @@ "*.rb", "*.swift", "*.kt", + "*.kts", "*.scala", "*.clj", "*.hs", @@ -407,6 +408,7 @@ "c": ["void {name}", "int {name}", "{name}("], "cpp": ["void {name}", "int {name}", "{name}("], "php": ["function {name}", "public function {name}", "private function {name}", "protected function {name}"], + "kotlin": ["fun {name}", "private fun {name}", "public fun {name}", "internal fun {name}", "protected fun {name}"], "general": ["{name}("], # Fallback pattern } @@ -533,7 +535,7 @@ def has_high_connectivity_potential(filename: str, filepath: str) -> bool: return False -def is_critical_function(func_name: str, code_snippet: str = None) -> bool: +def is_critical_function(func_name: str, code_snippet: Optional[str] = None) -> bool: """ Check if a function is critical based on name and code patterns. 
diff --git a/codewiki/src/be/prompt_template.py b/codewiki/src/be/prompt_template.py index d37d5b61..f6da5f8b 100644 --- a/codewiki/src/be/prompt_template.py +++ b/codewiki/src/be/prompt_template.py @@ -235,6 +235,8 @@ ".cjs": "javascript", ".jsx": "javascript", ".cs": "csharp", + ".kt": "kotlin", + ".kts": "kotlin", ".php": "php", ".phtml": "php", ".inc": "php" diff --git a/pyproject.toml b/pyproject.toml index 00c3e01d..8360f6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "tree-sitter-cpp>=0.23.4", "tree-sitter-c-sharp>=0.23.1", "tree-sitter-php>=0.23.0", + "tree-sitter-kotlin>=1.1.0", "openai>=1.107.0", "litellm>=1.77.0", "pydantic>=2.11.7", diff --git a/requirements.txt b/requirements.txt index bed6e91a..e2dce481 100644 --- a/requirements.txt +++ b/requirements.txt @@ -147,6 +147,7 @@ tree-sitter-cpp==0.23.4 tree-sitter-embedded-template==0.23.2 tree-sitter-java==0.23.5 tree-sitter-javascript==0.21.4 +tree-sitter-kotlin==1.1.0 tree-sitter-language-pack==0.8.0 tree-sitter-python==0.23.6 tree-sitter-typescript==0.21.2 From 28b15fbae02d4a7fe0e8dcea40d25af01cf869ee Mon Sep 17 00:00:00 2001 From: dalyzhou Date: Sun, 15 Mar 2026 21:31:35 +0800 Subject: [PATCH 2/9] fix: add missing runtime dependencies in pyproject.toml Several packages (colorama, fastapi, uvicorn, python-multipart, logfire) are imported in the source code but not declared in pyproject.toml, causing ModuleNotFoundError when installed via pip install. --- pyproject.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8360f6e2..c6eb08a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,12 @@ dependencies = [ "psutil>=7.0.0", "PyYAML>=6.0.2", "mermaid-parser-py>=0.0.2", - "mermaid-py>=0.8.0" + "mermaid-py>=0.8.0", + "fastapi>=0.116.0", + "uvicorn>=0.35.0", + "python-multipart>=0.0.20", + "colorama>=0.4.6", + "logfire>=4.1.0" ] [external] From 8ac96473c7d15a86269fd2c22e1daca02f0b9fab Mon Sep 17 00:00:00 2001 From: dalyzhou Date: Sun, 15 Mar 2026 22:11:28 +0800 Subject: [PATCH 3/9] fix: exclude node_modules from dependency analysis and add progress logging node_modules was missing from DEFAULT_IGNORE_PATTERNS, causing the dependency analyzer to parse all files in node_modules (225k+ files instead of ~600). Also added per-file progress logging and timeout protection to improve observability during long analysis runs. 
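For reference, the per-file timeout added here follows the standard SIGALRM pattern; a minimal sketch of the idea is below (illustrative only — it uses the built-in TimeoutError rather than the module-level one defined in this patch, and the real implementation additionally degrades gracefully on platforms without SIGALRM, such as Windows):

    import signal
    from contextlib import contextmanager

    @contextmanager
    def timeout(seconds):
        def handler(signum, frame):
            raise TimeoutError(f"parsing exceeded {seconds}s")
        old = signal.signal(signal.SIGALRM, handler)  # install alarm handler
        signal.alarm(seconds)                         # schedule SIGALRM
        try:
            yield
        finally:
            signal.alarm(0)                           # cancel any pending alarm
            signal.signal(signal.SIGALRM, old)        # restore previous handler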
--- .../analysis/call_graph_analyzer.py | 118 +++++++++++++----- .../be/dependency_analyzer/utils/patterns.py | 7 +- 2 files changed, 91 insertions(+), 34 deletions(-) diff --git a/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py b/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py index da825fd4..272ca0b6 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py +++ b/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py @@ -9,7 +9,10 @@ from typing import Dict, List import logging import traceback +import time +import signal from pathlib import Path +from contextlib import contextmanager from codewiki.src.be.dependency_analyzer.models.core import Node, CallRelationship from codewiki.src.be.dependency_analyzer.utils.patterns import CODE_EXTENSIONS from codewiki.src.be.dependency_analyzer.utils.security import safe_open_text @@ -17,6 +20,33 @@ logger = logging.getLogger(__name__) +class TimeoutError(Exception): + """Raised when file parsing exceeds timeout.""" + pass + + +@contextmanager +def timeout(seconds): + """Context manager for timeout on file parsing.""" + def signal_handler(signum, frame): + raise TimeoutError(f"File parsing exceeded {seconds}s timeout") + + # Only use signal on Unix systems (not Windows) + try: + old_handler = signal.signal(signal.SIGALRM, signal_handler) + signal.alarm(seconds) + yield + except AttributeError: + # Windows doesn't support SIGALRM, skip timeout + yield + finally: + try: + signal.alarm(0) + signal.signal(signal.SIGALRM, old_handler) + except (AttributeError, ValueError): + pass + + class CallGraphAnalyzer: def __init__(self): """Initialize the call graph analyzer.""" @@ -35,17 +65,35 @@ def analyze_code_files(self, code_files: List[Dict], base_dir: str) -> Dict: 4. 
Returns all nodes and relationships """ logger.debug(f"Starting analysis of {len(code_files)} files") + logger.info(f"📊 Parsing {len(code_files)} source files (this may take a few minutes)...") self.functions = {} self.call_relationships = [] files_analyzed = 0 - for file_info in code_files: - logger.debug(f"Analyzing: {file_info['path']}") - self._analyze_code_file(base_dir, file_info) - files_analyzed += 1 - logger.debug( - f"Analysis complete: {files_analyzed} files analyzed, {len(self.functions)} functions, {len(self.call_relationships)} relationships" + files_failed = 0 + start_time = time.time() + + for idx, file_info in enumerate(code_files, 1): + file_path = file_info['path'] + try: + # Log progress every file with elapsed time + if idx % max(1, len(code_files) // 10) == 0 or idx <= 5: + elapsed = time.time() - start_time + rate = idx / elapsed if elapsed > 0 else 0 + remaining = (len(code_files) - idx) / rate if rate > 0 else 0 + logger.info(f" [{idx}/{len(code_files)}] {file_path} ({elapsed:.1f}s elapsed, ~{remaining:.1f}s remaining)") + + self._analyze_code_file(base_dir, file_info) + files_analyzed += 1 + except Exception as e: + files_failed += 1 + logger.warning(f" ⚠️ [{idx}/{len(code_files)}] Failed to analyze {file_path}: {str(e)[:100]}") + + elapsed_time = time.time() - start_time + logger.info( + f"✓ Analysis complete: {files_analyzed}/{len(code_files)} files analyzed, " + f"{files_failed} failed, {len(self.functions)} functions, {len(self.call_relationships)} relationships ({elapsed_time:.1f}s)" ) logger.debug("Resolving call relationships") @@ -116,34 +164,38 @@ def _analyze_code_file(self, repo_dir: str, file_info: Dict): file_path = base / file_info["path"] try: - content = safe_open_text(base, file_path) - language = file_info["language"] - if language == "python": - self._analyze_python_file(file_path, content, repo_dir) - elif language == "javascript": - self._analyze_javascript_file(file_path, content, repo_dir) - elif language == "typescript": - self._analyze_typescript_file(file_path, content, repo_dir) - elif language == "java": - self._analyze_java_file(file_path, content, repo_dir) - elif language == "kotlin": - self._analyze_kotlin_file(file_path, content, repo_dir) - elif language == "csharp": - self._analyze_csharp_file(file_path, content, repo_dir) - elif language == "c": - self._analyze_c_file(file_path, content, repo_dir) - elif language == "cpp": - self._analyze_cpp_file(file_path, content, repo_dir) - elif language == "php": - self._analyze_php_file(file_path, content, repo_dir) - # else: - # logger.warning( - # f"Unsupported language for call graph analysis: {language} for file {file_path}" - # ) - + # Add timeout protection (30 seconds per file max) + with timeout(30): + content = safe_open_text(base, file_path) + language = file_info["language"] + if language == "python": + self._analyze_python_file(file_path, content, repo_dir) + elif language == "javascript": + self._analyze_javascript_file(file_path, content, repo_dir) + elif language == "typescript": + self._analyze_typescript_file(file_path, content, repo_dir) + elif language == "java": + self._analyze_java_file(file_path, content, repo_dir) + elif language == "kotlin": + self._analyze_kotlin_file(file_path, content, repo_dir) + elif language == "csharp": + self._analyze_csharp_file(file_path, content, repo_dir) + elif language == "c": + self._analyze_c_file(file_path, content, repo_dir) + elif language == "cpp": + self._analyze_cpp_file(file_path, content, repo_dir) + elif language == "php": 
+ self._analyze_php_file(file_path, content, repo_dir) + # else: + # logger.warning( + # f"Unsupported language for call graph analysis: {language} for file {file_path}" + # ) + + except TimeoutError as e: + logger.warning(f"⏱️ Timeout analyzing {file_path}: {str(e)}") except Exception as e: - logger.error(f"⚠️ Error analyzing {file_path}: {str(e)}") - logger.error(f"Traceback: {traceback.format_exc()}") + logger.debug(f"Error analyzing {file_path}: {str(e)}") + logger.debug(f"Traceback: {traceback.format_exc()}") def _analyze_python_file(self, file_path: str, content: str, base_dir: str): """ diff --git a/codewiki/src/be/dependency_analyzer/utils/patterns.py b/codewiki/src/be/dependency_analyzer/utils/patterns.py index 1680ed4f..36440d3c 100644 --- a/codewiki/src/be/dependency_analyzer/utils/patterns.py +++ b/codewiki/src/be/dependency_analyzer/utils/patterns.py @@ -29,12 +29,17 @@ ".hypothesis", "poetry.lock", "Pipfile.lock", - # JavaScript/FileSystemNode + # JavaScript/Node.js (CRITICAL: node_modules must be excluded) + "node_modules/", + "node_modules", "package-lock.json", "yarn.lock", ".npm", ".yarn", ".pnpm-store", + ".next/", + ".nuxt/", + ".turbo/", "bun.lock", "bun.lockb", # Java From 584805c5cb61869865ec97cb4f0d666b4e296840 Mon Sep 17 00:00:00 2001 From: dalyzhou Date: Sun, 15 Mar 2026 22:50:06 +0800 Subject: [PATCH 4/9] fix: handle non-standard responses from OpenAI-compatible API proxies Some OpenAI-compatible proxies (Azure, vLLM, internal proxies, etc.) return choices[].index as null instead of an integer, causing pydantic validation to fail. Add a CompatibleOpenAIModel subclass that patches these fields before validation. --- codewiki/src/be/llm_services.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/codewiki/src/be/llm_services.py b/codewiki/src/be/llm_services.py index 0de98438..90a693ec 100644 --- a/codewiki/src/be/llm_services.py +++ b/codewiki/src/be/llm_services.py @@ -1,6 +1,11 @@ """ LLM service factory for creating configured LLM clients. + +Includes a compatibility layer for OpenAI-compatible API proxies that may +return slightly non-standard responses (e.g. choices[].index = None). """ +import logging +from openai.types import chat from pydantic_ai.models.openai import OpenAIModel from pydantic_ai.providers.openai import OpenAIProvider from pydantic_ai.models.openai import OpenAIModelSettings @@ -9,10 +14,29 @@ from codewiki.src.config import Config +logger = logging.getLogger(__name__) + + +class CompatibleOpenAIModel(OpenAIModel): + """OpenAIModel subclass that patches non-standard API proxy responses. + + Some OpenAI-compatible proxies return responses with fields like + choices[].index set to None instead of an integer. This subclass + fixes those fields before pydantic validation runs. + """ + + def _validate_completion(self, response: chat.ChatCompletion) -> chat.ChatCompletion: + # Patch choices[].index: None -> sequential integer (0, 1, 2, ...) 
+ if response.choices: + for i, choice in enumerate(response.choices): + if choice.index is None: + choice.index = i + return super()._validate_completion(response) + -def create_main_model(config: Config) -> OpenAIModel: +def create_main_model(config: Config) -> CompatibleOpenAIModel: """Create the main LLM model from configuration.""" - return OpenAIModel( + return CompatibleOpenAIModel( model_name=config.main_model, provider=OpenAIProvider( base_url=config.llm_base_url, @@ -25,9 +49,9 @@ def create_main_model(config: Config) -> OpenAIModel: ) -def create_fallback_model(config: Config) -> OpenAIModel: +def create_fallback_model(config: Config) -> CompatibleOpenAIModel: """Create the fallback LLM model from configuration.""" - return OpenAIModel( + return CompatibleOpenAIModel( model_name=config.fallback_model, provider=OpenAIProvider( base_url=config.llm_base_url, From 36c93c23db8c43ad8de676bc5a8dba276f734e45 Mon Sep 17 00:00:00 2001 From: Nghi Bui Date: Tue, 17 Mar 2026 03:39:02 +0700 Subject: [PATCH 5/9] Fix issues #44, #34, #17, #43: OpenAI compat, Anthropic validation, keyring fallback, verbose logging - #44: Use max_completion_tokens for newer OpenAI models (o1, o3, gpt-4o) that reject the deprecated max_tokens parameter - #34: Detect Anthropic API URLs and use the anthropic SDK for connectivity tests instead of forcing OpenAI client on all providers - #17: Add file-based fallback (credentials.json) when system keyring is unavailable (headless containers, RHEL). Support CODEWIKI_NO_KEYRING=1 env var to force file-based storage - #43: Add file-level and module-level verbose logging during dependency analysis, clustering, and doc generation phases Co-Authored-By: Claude Opus 4.6 (1M context) --- codewiki/cli/adapters/doc_generator.py | 25 ++++- codewiki/cli/commands/config.py | 27 +++++- codewiki/cli/config_manager.py | 121 ++++++++++++++++++------- codewiki/src/be/llm_services.py | 63 ++++++++++--- 4 files changed, 180 insertions(+), 56 deletions(-) diff --git a/codewiki/cli/adapters/doc_generator.py b/codewiki/cli/adapters/doc_generator.py index 826b60ca..78256f2e 100644 --- a/codewiki/cli/adapters/doc_generator.py +++ b/codewiki/cli/adapters/doc_generator.py @@ -186,9 +186,14 @@ async def _run_backend_generation(self, backend_config: BackendConfig): components, leaf_nodes = doc_generator.graph_builder.build_dependency_graph() self.job.statistics.total_files_analyzed = len(components) self.job.statistics.leaf_nodes = len(leaf_nodes) - + if self.verbose: - self.progress_tracker.update_stage(1.0, f"Found {len(leaf_nodes)} leaf nodes") + self.progress_tracker.update_stage(0.8, f"Analyzed {len(components)} files, found {len(leaf_nodes)} leaf nodes") + # Log individual files analyzed + for comp_name in sorted(components.keys())[:20]: + self.progress_tracker.update_stage(0.9, f" File: {comp_name}") + if len(components) > 20: + self.progress_tracker.update_stage(0.9, f" ... 
and {len(components) - 20} more files") except Exception as e: raise APIError(f"Dependency analysis failed: {e}") @@ -212,15 +217,22 @@ async def _run_backend_generation(self, backend_config: BackendConfig): try: if os.path.exists(first_module_tree_path): module_tree = file_manager.load_json(first_module_tree_path) + if self.verbose: + self.progress_tracker.update_stage(0.5, "Loaded cached module tree") else: + if self.verbose: + self.progress_tracker.update_stage(0.3, f"Clustering {len(leaf_nodes)} leaf nodes with LLM...") module_tree = cluster_modules(leaf_nodes, components, backend_config) file_manager.save_json(module_tree, first_module_tree_path) - + file_manager.save_json(module_tree, module_tree_path) self.job.module_count = len(module_tree) - + if self.verbose: self.progress_tracker.update_stage(1.0, f"Created {len(module_tree)} modules") + for mod_name in sorted(module_tree.keys()): + file_count = len(module_tree[mod_name]) if isinstance(module_tree[mod_name], list) else "?" + self.progress_tracker.update_stage(1.0, f" Module: {mod_name} ({file_count} files)") except Exception as e: raise APIError(f"Module clustering failed: {e}") @@ -232,9 +244,12 @@ async def _run_backend_generation(self, backend_config: BackendConfig): self.progress_tracker.update_stage(0.1, "Generating module documentation...") try: + if self.verbose: + self.progress_tracker.update_stage(0.2, f"Generating documentation for {self.job.module_count} modules...") + # Run the actual documentation generation await doc_generator.generate_module_documentation(components, leaf_nodes) - + if self.verbose: self.progress_tracker.update_stage(0.9, "Creating repository overview...") diff --git a/codewiki/cli/commands/config.py b/codewiki/cli/commands/config.py index f776273a..090b2abc 100644 --- a/codewiki/cli/commands/config.py +++ b/codewiki/cli/commands/config.py @@ -490,13 +490,32 @@ def config_validate(quick: bool, verbose: bool): # Step 5: API connectivity test (unless --quick) if not quick: + if verbose: + click.echo() + click.echo("[5/5] Testing API connectivity...") + click.echo(f" URL: {config.base_url}") + try: - from openai import OpenAI - client = OpenAI(api_key=api_key, base_url=config.base_url) - response = client.models.list() - click.secho("✓ API connectivity test successful", fg="green") + base_url_lower = (config.base_url or "").lower() + if "api.anthropic.com" in base_url_lower: + # Use Anthropic SDK for native Anthropic endpoints + import anthropic + client = anthropic.Anthropic(api_key=api_key) + client.models.list(limit=1) + else: + # Use OpenAI SDK for OpenAI-compatible endpoints + from openai import OpenAI + client = OpenAI(api_key=api_key, base_url=config.base_url) + client.models.list() + + if verbose: + click.secho(" ✓ API responded successfully", fg="green") + else: + click.secho("✓ API connectivity test successful", fg="green") except Exception as e: click.secho("✗ API connectivity test failed", fg="red") + if verbose: + click.echo(f" Error: {e}") sys.exit(EXIT_CONFIG_ERROR) # Success diff --git a/codewiki/cli/config_manager.py b/codewiki/cli/config_manager.py index f1f86b2f..5a5c5e88 100644 --- a/codewiki/cli/config_manager.py +++ b/codewiki/cli/config_manager.py @@ -1,8 +1,14 @@ """ Configuration manager with keyring integration for secure credential storage. + +Supports fallback to file-based storage when system keyring is unavailable +(e.g. headless containers, RHEL without Secret Service). Set the environment +variable CODEWIKI_NO_KEYRING=1 to force file-based storage. 
""" import json +import os +import logging from pathlib import Path from typing import Optional import keyring @@ -12,6 +18,7 @@ from codewiki.cli.utils.errors import ConfigurationError, FileSystemError from codewiki.cli.utils.fs import ensure_directory, safe_write, safe_read +logger = logging.getLogger(__name__) # Keyring configuration KEYRING_SERVICE = "codewiki" @@ -20,33 +27,63 @@ # Configuration file location CONFIG_DIR = Path.home() / ".codewiki" CONFIG_FILE = CONFIG_DIR / "config.json" +CREDENTIALS_FILE = CONFIG_DIR / "credentials.json" CONFIG_VERSION = "1.0" class ConfigManager: """ Manages CodeWiki configuration with secure keyring storage for API keys. - + Storage: - - API key: System keychain via keyring (macOS Keychain, Windows Credential Manager, + - API key: System keychain via keyring (macOS Keychain, Windows Credential Manager, Linux Secret Service) + - Fallback: ~/.codewiki/credentials.json when keyring is unavailable - Other settings: ~/.codewiki/config.json + + Set CODEWIKI_NO_KEYRING=1 to skip keyring and use file-based storage. """ - + def __init__(self): """Initialize the configuration manager.""" self._api_key: Optional[str] = None self._config: Optional[Configuration] = None + self._force_no_keyring = os.environ.get("CODEWIKI_NO_KEYRING", "").strip() in ("1", "true", "yes") self._keyring_available = self._check_keyring_available() - + def _check_keyring_available(self) -> bool: """Check if system keyring is available.""" + if self._force_no_keyring: + logger.debug("Keyring disabled via CODEWIKI_NO_KEYRING") + return False try: # Try to get/set a test value keyring.get_password(KEYRING_SERVICE, "__test__") return True - except KeyringError: + except (KeyringError, Exception): return False + + def _load_api_key_from_file(self) -> Optional[str]: + """Load API key from fallback credentials file.""" + if not CREDENTIALS_FILE.exists(): + return None + try: + content = safe_read(CREDENTIALS_FILE) + data = json.loads(content) + return data.get("api_key") + except (json.JSONDecodeError, FileSystemError): + return None + + def _save_api_key_to_file(self, api_key: str): + """Save API key to fallback credentials file (plaintext).""" + ensure_directory(CONFIG_DIR) + data = {"api_key": api_key} + safe_write(CREDENTIALS_FILE, json.dumps(data, indent=2)) + # Restrict file permissions (owner read/write only) + try: + CREDENTIALS_FILE.chmod(0o600) + except OSError: + pass def load(self) -> bool: """ @@ -70,12 +107,14 @@ def load(self) -> bool: self._config = Configuration.from_dict(data) - # Load API key from keyring - try: - self._api_key = keyring.get_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT) - except KeyringError: - # Keyring unavailable, API key will be None - pass + # Load API key from keyring, falling back to file + if self._keyring_available: + try: + self._api_key = keyring.get_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT) + except (KeyringError, Exception): + pass + if self._api_key is None: + self._api_key = self._load_api_key_from_file() return True except (json.JSONDecodeError, FileSystemError) as e: @@ -154,17 +193,23 @@ def save( if self._config.base_url and self._config.main_model and self._config.cluster_model: self._config.validate() - # Save API key to keyring + # Save API key to keyring, falling back to file if api_key is not None: self._api_key = api_key - try: - keyring.set_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT, api_key) - except KeyringError as e: - # Fallback: warn about keyring unavailability - raise ConfigurationError( - f"System 
keychain unavailable: {e}\n" - f"Please ensure your system keychain is properly configured." - ) + if self._keyring_available: + try: + keyring.set_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT, api_key) + except (KeyringError, Exception): + # Keyring failed at runtime — fall back to file + self._keyring_available = False + self._save_api_key_to_file(api_key) + logger.warning( + "System keychain unavailable. API key stored in %s " + "(plaintext). Set CODEWIKI_NO_KEYRING=1 to suppress this warning.", + CREDENTIALS_FILE + ) + else: + self._save_api_key_to_file(api_key) # Save non-sensitive config to JSON config_data = { @@ -179,17 +224,20 @@ def save( def get_api_key(self) -> Optional[str]: """ - Get API key from keyring. - + Get API key from keyring or fallback file. + Returns: API key or None if not set """ if self._api_key is None: - try: - self._api_key = keyring.get_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT) - except KeyringError: - pass - + if self._keyring_available: + try: + self._api_key = keyring.get_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT) + except (KeyringError, Exception): + pass + if self._api_key is None: + self._api_key = self._load_api_key_from_file() + return self._api_key def get_config(self) -> Optional[Configuration]: @@ -219,12 +267,19 @@ def is_configured(self) -> bool: return self._config.is_complete() def delete_api_key(self): - """Delete API key from keyring.""" - try: - keyring.delete_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT) - self._api_key = None - except KeyringError: - pass + """Delete API key from keyring and fallback file.""" + if self._keyring_available: + try: + keyring.delete_password(KEYRING_SERVICE, KEYRING_API_KEY_ACCOUNT) + except (KeyringError, Exception): + pass + # Also remove fallback credentials file + if CREDENTIALS_FILE.exists(): + try: + CREDENTIALS_FILE.unlink() + except OSError: + pass + self._api_key = None def clear(self): """Clear all configuration (file and keyring).""" diff --git a/codewiki/src/be/llm_services.py b/codewiki/src/be/llm_services.py index 90a693ec..08af907f 100644 --- a/codewiki/src/be/llm_services.py +++ b/codewiki/src/be/llm_services.py @@ -6,6 +6,7 @@ """ import logging from openai.types import chat + from pydantic_ai.models.openai import OpenAIModel from pydantic_ai.providers.openai import OpenAIProvider from pydantic_ai.models.openai import OpenAIModelSettings @@ -17,9 +18,40 @@ logger = logging.getLogger(__name__) +def _should_use_max_completion_tokens(model_name: str, base_url: str) -> bool: + """ + Determine whether to use max_completion_tokens instead of max_tokens. + + Newer OpenAI models (o1, o3, gpt-4o, etc.) require max_completion_tokens. + Anthropic and other providers still use max_tokens. 
+ """ + model_lower = model_name.lower() + # OpenAI models that require max_completion_tokens + new_openai_patterns = ("o1", "o3", "gpt-4o", "gpt-4-turbo") + if any(pattern in model_lower for pattern in new_openai_patterns): + return True + # If base_url points to OpenAI directly, newer models may need it + if base_url and "api.openai.com" in base_url: + return True + return False + + +def _build_model_settings(config: Config, model_name: str) -> OpenAIModelSettings: + """Build model settings with the correct token parameter.""" + if _should_use_max_completion_tokens(model_name, config.llm_base_url): + return OpenAIModelSettings( + temperature=0.0, + max_completion_tokens=config.max_tokens + ) + return OpenAIModelSettings( + temperature=0.0, + max_tokens=config.max_tokens + ) + + class CompatibleOpenAIModel(OpenAIModel): """OpenAIModel subclass that patches non-standard API proxy responses. - + Some OpenAI-compatible proxies return responses with fields like choices[].index set to None instead of an integer. This subclass fixes those fields before pydantic validation runs. @@ -42,10 +74,7 @@ def create_main_model(config: Config) -> CompatibleOpenAIModel: base_url=config.llm_base_url, api_key=config.llm_api_key ), - settings=OpenAIModelSettings( - temperature=0.0, - max_tokens=config.max_tokens - ) + settings=_build_model_settings(config, config.main_model) ) @@ -57,10 +86,7 @@ def create_fallback_model(config: Config) -> CompatibleOpenAIModel: base_url=config.llm_base_url, api_key=config.llm_api_key ), - settings=OpenAIModelSettings( - temperature=0.0, - max_tokens=config.max_tokens - ) + settings=_build_model_settings(config, config.fallback_model) ) @@ -87,24 +113,33 @@ def call_llm( ) -> str: """ Call LLM with the given prompt. - + Args: prompt: The prompt to send config: Configuration containing LLM settings model: Model name (defaults to config.main_model) temperature: Temperature setting - + Returns: LLM response text """ if model is None: model = config.main_model - + client = create_openai_client(config) + + # Use the correct token parameter based on model/provider + token_kwargs = {} + if _should_use_max_completion_tokens(model, config.llm_base_url): + token_kwargs["max_completion_tokens"] = config.max_tokens + logger.debug("Using max_completion_tokens=%d for model %s", config.max_tokens, model) + else: + token_kwargs["max_tokens"] = config.max_tokens + response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}], temperature=temperature, - max_tokens=config.max_tokens + **token_kwargs ) - return response.choices[0].message.content \ No newline at end of file + return response.choices[0].message.content From 086969a19c8f1d4a338bdea9430a0ff63c2eece9 Mon Sep 17 00:00:00 2001 From: Nghi Bui Date: Tue, 17 Mar 2026 03:51:11 +0700 Subject: [PATCH 6/9] Add AWS Bedrock support (#40), MCP server (#9), and incremental updates (#15) - #40 Bedrock: Add --provider (openai-compatible|anthropic|bedrock) and --aws-region flags. Uses litellm to translate API calls for Bedrock and Anthropic providers. Passes provider/region through full config pipeline. - #9 MCP server: Add `codewiki mcp` command that starts an MCP stdio server exposing three tools: generate_docs, analyze_repo, and get_module_tree. Uses the mcp SDK (already in requirements). Compatible with Claude, Cursor, and other MCP clients. 
- #15 Incremental updates: Add `codewiki generate --update` flag that detects changed files since last generation (via commit_id in metadata.json + git diff), invalidates affected module docs, and only regenerates what changed. Co-Authored-By: Claude Opus 4.6 (1M context) --- codewiki/cli/adapters/doc_generator.py | 2 + codewiki/cli/commands/config.py | 36 ++- codewiki/cli/commands/generate.py | 157 ++++++++++++- codewiki/cli/config_manager.py | 16 +- codewiki/cli/main.py | 22 ++ codewiki/cli/models/config.py | 12 +- codewiki/mcp/__init__.py | 6 + codewiki/mcp/server.py | 310 +++++++++++++++++++++++++ codewiki/src/be/llm_services.py | 82 +++++++ codewiki/src/config.py | 15 +- 10 files changed, 643 insertions(+), 15 deletions(-) create mode 100644 codewiki/mcp/__init__.py create mode 100644 codewiki/mcp/server.py diff --git a/codewiki/cli/adapters/doc_generator.py b/codewiki/cli/adapters/doc_generator.py index 78256f2e..7551f156 100644 --- a/codewiki/cli/adapters/doc_generator.py +++ b/codewiki/cli/adapters/doc_generator.py @@ -137,6 +137,8 @@ def generate(self) -> DocumentationJob: main_model=self.config.get('main_model'), cluster_model=self.config.get('cluster_model'), fallback_model=self.config.get('fallback_model'), + provider=self.config.get('provider', 'openai-compatible'), + aws_region=self.config.get('aws_region', 'us-east-1'), max_tokens=self.config.get('max_tokens', 32768), max_token_per_module=self.config.get('max_token_per_module', 36369), max_token_per_leaf_module=self.config.get('max_token_per_leaf_module', 16000), diff --git a/codewiki/cli/commands/config.py b/codewiki/cli/commands/config.py index 090b2abc..5ef6d99c 100644 --- a/codewiki/cli/commands/config.py +++ b/codewiki/cli/commands/config.py @@ -83,6 +83,16 @@ def config_group(): type=int, help="Maximum depth for hierarchical decomposition (default: 2)" ) +@click.option( + "--provider", + type=click.Choice(['openai-compatible', 'anthropic', 'bedrock'], case_sensitive=False), + help="LLM provider type (default: openai-compatible)" +) +@click.option( + "--aws-region", + type=str, + help="AWS region for Bedrock provider (default: us-east-1)" +) def config_set( api_key: Optional[str], base_url: Optional[str], @@ -92,7 +102,9 @@ def config_set( max_tokens: Optional[int], max_token_per_module: Optional[int], max_token_per_leaf_module: Optional[int], - max_depth: Optional[int] + max_depth: Optional[int], + provider: Optional[str] = None, + aws_region: Optional[str] = None ): """ Set configuration values for CodeWiki. @@ -127,7 +139,7 @@ def config_set( """ try: # Check if at least one option is provided - if not any([api_key, base_url, main_model, cluster_model, fallback_model, max_tokens, max_token_per_module, max_token_per_leaf_module, max_depth]): + if not any([api_key, base_url, main_model, cluster_model, fallback_model, max_tokens, max_token_per_module, max_token_per_leaf_module, max_depth, provider, aws_region]): click.echo("No options provided. 
Use --help for usage information.") sys.exit(EXIT_CONFIG_ERROR) @@ -168,11 +180,17 @@ def config_set( if max_depth < 1: raise ConfigurationError("max_depth must be a positive integer") validated_data['max_depth'] = max_depth - + + if provider is not None: + validated_data['provider'] = provider + + if aws_region is not None: + validated_data['aws_region'] = aws_region + # Create config manager and save manager = ConfigManager() manager.load() # Load existing config if present - + manager.save( api_key=validated_data.get('api_key'), base_url=validated_data.get('base_url'), @@ -182,7 +200,9 @@ def config_set( max_tokens=validated_data.get('max_tokens'), max_token_per_module=validated_data.get('max_token_per_module'), max_token_per_leaf_module=validated_data.get('max_token_per_leaf_module'), - max_depth=validated_data.get('max_depth') + max_depth=validated_data.get('max_depth'), + provider=validated_data.get('provider'), + aws_region=validated_data.get('aws_region') ) # Display success messages @@ -230,6 +250,12 @@ def config_set( if max_depth: click.secho(f"✓ Max depth: {max_depth}", fg="green") + + if provider: + click.secho(f"✓ Provider: {provider}", fg="green") + + if aws_region: + click.secho(f"✓ AWS Region: {aws_region}", fg="green") click.echo("\n" + click.style("Configuration updated successfully.", fg="green", bold=True)) diff --git a/codewiki/cli/commands/generate.py b/codewiki/cli/commands/generate.py index 8512f736..b7caea2d 100644 --- a/codewiki/cli/commands/generate.py +++ b/codewiki/cli/commands/generate.py @@ -39,6 +39,139 @@ def parse_patterns(patterns_str: str) -> List[str]: return [p.strip() for p in patterns_str.split(',') if p.strip()] +def _detect_changed_files( + repo_path: Path, + output_dir: Path, + logger, + verbose: bool +) -> Optional[List[str]]: + """ + Detect files changed since the last documentation generation. + + Reads the commit_id from metadata.json and compares with current HEAD + using git diff. Returns list of changed file paths, or None if unable + to determine (e.g., no metadata, not a git repo). + """ + import json + + metadata_path = output_dir / "metadata.json" + if not metadata_path.exists(): + if verbose: + logger.debug("No metadata.json found — cannot detect changes, running full generation.") + return None + + try: + metadata = json.loads(metadata_path.read_text()) + prev_commit = metadata.get("generation_info", {}).get("commit_id") + if not prev_commit: + if verbose: + logger.debug("No commit_id in metadata — running full generation.") + return None + except (json.JSONDecodeError, OSError): + return None + + # Get current HEAD commit + try: + import git + repo = git.Repo(repo_path, search_parent_directories=True) + current_commit = repo.head.commit.hexsha + except Exception: + if verbose: + logger.debug("Cannot access git repo — running full generation.") + return None + + if prev_commit == current_commit: + if verbose: + logger.debug(f"HEAD is still at {current_commit[:8]} — no changes.") + return [] + + # Get changed files between previous and current commit + try: + diff_index = repo.commit(prev_commit).diff(current_commit) + changed = [] + for diff in diff_index: + if diff.a_path: + changed.append(diff.a_path) + if diff.b_path and diff.b_path != diff.a_path: + changed.append(diff.b_path) + + if verbose: + logger.debug(f"Changes between {prev_commit[:8]} and {current_commit[:8]}:") + for f in changed[:10]: + logger.debug(f" {f}") + if len(changed) > 10: + logger.debug(f" ... 
and {len(changed) - 10} more") + + return changed + except Exception as e: + if verbose: + logger.debug(f"Git diff failed: {e} — running full generation.") + return None + + +def _invalidate_affected_modules( + output_dir: Path, + changed_files: List[str], + logger, + verbose: bool +): + """ + Remove cached module documentation for modules that contain changed files. + + Reads module_tree.json to find which modules contain changed files, + then deletes their .md files so they get regenerated. + """ + import json + + module_tree_path = output_dir / "module_tree.json" + if not module_tree_path.exists(): + return + + try: + module_tree = json.loads(module_tree_path.read_text()) + except (json.JSONDecodeError, OSError): + return + + changed_set = set(changed_files) + modules_to_invalidate = set() + + def _find_affected(tree, parent_names=None): + if parent_names is None: + parent_names = [] + for mod_name, mod_info in tree.items(): + components = mod_info.get("components", []) + # Check if any component path overlaps with changed files + for comp in components: + # Component IDs may be class names, check if they match any changed file path + if any(changed_file in comp or comp in changed_file for changed_file in changed_set): + modules_to_invalidate.add(mod_name) + # Also invalidate parent modules + for parent in parent_names: + modules_to_invalidate.add(parent) + break + + children = mod_info.get("children", {}) + if isinstance(children, dict) and children: + _find_affected(children, parent_names + [mod_name]) + + _find_affected(module_tree) + + # Also remove overview.md since it depends on child docs + if modules_to_invalidate: + modules_to_invalidate.add("overview") + + # Delete affected module docs + for mod_name in modules_to_invalidate: + doc_path = output_dir / f"{mod_name}.md" + if doc_path.exists(): + doc_path.unlink() + if verbose: + logger.debug(f"Invalidated: {doc_path.name}") + + if verbose: + logger.debug(f"Invalidated {len(modules_to_invalidate)} modules for regeneration.") + + @click.command(name="generate") @click.option( "--output", @@ -126,6 +259,11 @@ def parse_patterns(patterns_str: str) -> List[str]: default=None, help="Maximum depth for hierarchical decomposition (overrides config)", ) +@click.option( + "--update", + is_flag=True, + help="Incremental update: only regenerate modules affected by changes since last generation", +) @click.pass_context def generate_command( ctx, @@ -142,7 +280,8 @@ def generate_command( max_tokens: Optional[int], max_token_per_module: Optional[int], max_token_per_leaf_module: Optional[int], - max_depth: Optional[int] + max_depth: Optional[int], + update: bool = False ): """ Generate comprehensive documentation for a code repository. @@ -246,8 +385,20 @@ def generate_command( logger.success(f"Output directory: {output_dir}") + # Incremental update: detect changed files and selectively regenerate + changed_files = None + if update and output_dir.exists(): + changed_files = _detect_changed_files(repo_path, output_dir, logger, verbose) + if changed_files is not None and len(changed_files) == 0: + logger.success("No changes detected since last generation. 
Documentation is up to date.") + sys.exit(EXIT_SUCCESS) + if changed_files is not None: + logger.info(f" Detected {len(changed_files)} changed files — regenerating affected modules.") + # Remove cached module docs for affected files so they get regenerated + _invalidate_affected_modules(output_dir, changed_files, logger, verbose) + # Check for existing documentation - if output_dir.exists() and list(output_dir.glob("*.md")): + if not update and output_dir.exists() and list(output_dir.glob("*.md")): if not click.confirm( f"\n{output_dir} already contains documentation. Overwrite?", default=True @@ -352,6 +503,8 @@ def generate_command( 'fallback_model': config.fallback_model, 'base_url': config.base_url, 'api_key': api_key, + 'provider': getattr(config, 'provider', 'openai-compatible'), + 'aws_region': getattr(config, 'aws_region', 'us-east-1'), 'agent_instructions': agent_instructions_dict, # Max token settings (runtime overrides take precedence) 'max_tokens': max_tokens if max_tokens is not None else config.max_tokens, diff --git a/codewiki/cli/config_manager.py b/codewiki/cli/config_manager.py index 5a5c5e88..a652e405 100644 --- a/codewiki/cli/config_manager.py +++ b/codewiki/cli/config_manager.py @@ -121,7 +121,7 @@ def load(self) -> bool: raise ConfigurationError(f"Failed to load configuration: {e}") def save( - self, + self, api_key: Optional[str] = None, base_url: Optional[str] = None, main_model: Optional[str] = None, @@ -131,11 +131,13 @@ def save( max_tokens: Optional[int] = None, max_token_per_module: Optional[int] = None, max_token_per_leaf_module: Optional[int] = None, - max_depth: Optional[int] = None + max_depth: Optional[int] = None, + provider: Optional[str] = None, + aws_region: Optional[str] = None ): """ Save configuration to file and keyring. - + Args: api_key: API key (stored in keyring) base_url: LLM API base URL @@ -147,6 +149,8 @@ def save( max_token_per_module: Maximum tokens per module for clustering max_token_per_leaf_module: Maximum tokens per leaf module max_depth: Maximum depth for hierarchical decomposition + provider: LLM provider type (openai-compatible, anthropic, bedrock) + aws_region: AWS region for Bedrock provider """ # Ensure config directory exists try: @@ -188,7 +192,11 @@ def save( self._config.max_token_per_leaf_module = max_token_per_leaf_module if max_depth is not None: self._config.max_depth = max_depth - + if provider is not None: + self._config.provider = provider + if aws_region is not None: + self._config.aws_region = aws_region + # Validate configuration (only if base fields are set) if self._config.base_url and self._config.main_model and self._config.cluster_model: self._config.validate() diff --git a/codewiki/cli/main.py b/codewiki/cli/main.py index 44b7f751..23ebc319 100644 --- a/codewiki/cli/main.py +++ b/codewiki/cli/main.py @@ -39,6 +39,28 @@ def version(): cli.add_command(generate_command, name="generate") +@cli.command(name="mcp") +def mcp_command(): + """Start CodeWiki as an MCP (Model Context Protocol) server. + + Exposes documentation generation tools via MCP stdio transport. + Configure in your MCP client (Claude, Cursor, etc.) 
as: + + \b + { + "mcpServers": { + "codewiki": { + "command": "codewiki", + "args": ["mcp"] + } + } + } + """ + import asyncio + from codewiki.mcp.server import main as mcp_main + asyncio.run(mcp_main()) + + def main(): """Entry point for the CLI.""" try: diff --git a/codewiki/cli/models/config.py b/codewiki/cli/models/config.py index 585b4272..8bfe9152 100644 --- a/codewiki/cli/models/config.py +++ b/codewiki/cli/models/config.py @@ -106,13 +106,15 @@ def get_prompt_addition(self) -> str: class Configuration: """ CodeWiki configuration data model. - + Attributes: base_url: LLM API base URL main_model: Primary model for documentation generation cluster_model: Model for module clustering fallback_model: Fallback model for documentation generation default_output: Default output directory + provider: LLM provider type (openai-compatible, anthropic, bedrock) + aws_region: AWS region for Bedrock provider max_tokens: Maximum tokens for LLM response (default: 32768) max_token_per_module: Maximum tokens per module for clustering (default: 36369) max_token_per_leaf_module: Maximum tokens per leaf module (default: 16000) @@ -124,6 +126,8 @@ class Configuration: cluster_model: str fallback_model: str = "glm-4p5" default_output: str = "docs" + provider: str = "openai-compatible" + aws_region: str = "us-east-1" max_tokens: int = 32768 max_token_per_module: int = 36369 max_token_per_leaf_module: int = 16000 @@ -149,6 +153,8 @@ def to_dict(self) -> dict: 'main_model': self.main_model, 'cluster_model': self.cluster_model, 'default_output': self.default_output, + 'provider': self.provider, + 'aws_region': self.aws_region, 'max_tokens': self.max_tokens, 'max_token_per_module': self.max_token_per_module, 'max_token_per_leaf_module': self.max_token_per_leaf_module, @@ -179,6 +185,8 @@ def from_dict(cls, data: dict) -> 'Configuration': cluster_model=data.get('cluster_model', ''), fallback_model=data.get('fallback_model', 'glm-4p5'), default_output=data.get('default_output', 'docs'), + provider=data.get('provider', 'openai-compatible'), + aws_region=data.get('aws_region', 'us-east-1'), max_tokens=data.get('max_tokens', 32768), max_token_per_module=data.get('max_token_per_module', 36369), max_token_per_leaf_module=data.get('max_token_per_leaf_module', 16000), @@ -233,6 +241,8 @@ def to_backend_config(self, repo_path: str, output_dir: str, api_key: str, runti main_model=self.main_model, cluster_model=self.cluster_model, fallback_model=self.fallback_model, + provider=self.provider, + aws_region=self.aws_region, max_tokens=self.max_tokens, max_token_per_module=self.max_token_per_module, max_token_per_leaf_module=self.max_token_per_leaf_module, diff --git a/codewiki/mcp/__init__.py b/codewiki/mcp/__init__.py new file mode 100644 index 00000000..27c9847d --- /dev/null +++ b/codewiki/mcp/__init__.py @@ -0,0 +1,6 @@ +""" +CodeWiki MCP (Model Context Protocol) server. + +Exposes CodeWiki documentation generation capabilities as MCP tools +that can be called by Claude, Cursor, and other MCP-compatible clients. +""" diff --git a/codewiki/mcp/server.py b/codewiki/mcp/server.py new file mode 100644 index 00000000..600da22c --- /dev/null +++ b/codewiki/mcp/server.py @@ -0,0 +1,310 @@ +""" +CodeWiki MCP Server. 
+ +Exposes documentation generation as MCP tools: + - generate_docs: Generate full documentation for a repository + - analyze_repo: Analyze repository structure and dependencies + - get_module_tree: Get the module clustering for a repository + +Usage: + # Run as standalone MCP server (stdio transport) + python -m codewiki.mcp.server + + # Or register in your MCP client config: + { + "mcpServers": { + "codewiki": { + "command": "python", + "args": ["-m", "codewiki.mcp.server"] + } + } + } +""" + +import asyncio +import json +import logging +import os +from pathlib import Path +from typing import Any + +from mcp.server import Server +from mcp.server.stdio import stdio_server +from mcp.types import ( + TextContent, + Tool, +) + +logger = logging.getLogger(__name__) + +# Create the MCP server +server = Server("codewiki") + + +def _load_config(): + """Load CodeWiki configuration from ~/.codewiki/config.json + keyring.""" + from codewiki.cli.config_manager import ConfigManager + manager = ConfigManager() + if not manager.load(): + raise RuntimeError( + "CodeWiki not configured. Run 'codewiki config set' first." + ) + return manager + + +@server.list_tools() +async def list_tools() -> list[Tool]: + """List available CodeWiki MCP tools.""" + return [ + Tool( + name="generate_docs", + description=( + "Generate comprehensive AI-powered documentation for a code repository. " + "Analyzes dependencies, clusters modules, and generates markdown documentation." + ), + inputSchema={ + "type": "object", + "properties": { + "repo_path": { + "type": "string", + "description": "Absolute path to the repository to document", + }, + "output_dir": { + "type": "string", + "description": "Output directory for generated docs (default: ./docs)", + "default": "docs", + }, + "doc_type": { + "type": "string", + "enum": ["api", "architecture", "user-guide", "developer"], + "description": "Type of documentation to generate", + }, + "include_patterns": { + "type": "string", + "description": "Comma-separated file patterns to include (e.g., '*.py,*.js')", + }, + "exclude_patterns": { + "type": "string", + "description": "Comma-separated patterns to exclude (e.g., '*test*,*spec*')", + }, + }, + "required": ["repo_path"], + }, + ), + Tool( + name="analyze_repo", + description=( + "Analyze a repository's structure, dependencies, and component hierarchy " + "without generating full documentation. Returns file counts, languages, " + "and dependency information." + ), + inputSchema={ + "type": "object", + "properties": { + "repo_path": { + "type": "string", + "description": "Absolute path to the repository to analyze", + }, + }, + "required": ["repo_path"], + }, + ), + Tool( + name="get_module_tree", + description=( + "Get the module clustering tree for a repository. " + "Shows how source files are grouped into logical modules." 
+ ), + inputSchema={ + "type": "object", + "properties": { + "repo_path": { + "type": "string", + "description": "Absolute path to the repository", + }, + "output_dir": { + "type": "string", + "description": "Directory containing generated docs (default: ./docs)", + "default": "docs", + }, + }, + "required": ["repo_path"], + }, + ), + ] + + +@server.call_tool() +async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]: + """Handle MCP tool calls.""" + try: + if name == "generate_docs": + return await _handle_generate_docs(arguments) + elif name == "analyze_repo": + return await _handle_analyze_repo(arguments) + elif name == "get_module_tree": + return await _handle_get_module_tree(arguments) + else: + return [TextContent(type="text", text=f"Unknown tool: {name}")] + except Exception as e: + logger.error("Tool %s failed: %s", name, e, exc_info=True) + return [TextContent(type="text", text=f"Error: {e}")] + + +async def _handle_generate_docs(arguments: dict[str, Any]) -> list[TextContent]: + """Handle generate_docs tool call.""" + repo_path = Path(arguments["repo_path"]).expanduser().resolve() + output_dir = Path(arguments.get("output_dir", "docs")).expanduser().resolve() + + if not repo_path.exists(): + return [TextContent(type="text", text=f"Repository not found: {repo_path}")] + + # Load config + manager = _load_config() + config = manager.get_config() + api_key = manager.get_api_key() + + if not api_key: + return [TextContent(type="text", text="API key not configured. Run 'codewiki config set --api-key '")] + + # Build agent instructions from arguments + agent_instructions = {} + if arguments.get("doc_type"): + agent_instructions["doc_type"] = arguments["doc_type"] + if arguments.get("include_patterns"): + agent_instructions["include_patterns"] = [p.strip() for p in arguments["include_patterns"].split(",")] + if arguments.get("exclude_patterns"): + agent_instructions["exclude_patterns"] = [p.strip() for p in arguments["exclude_patterns"].split(",")] + + from codewiki.src.config import Config as BackendConfig, set_cli_context + set_cli_context(True) + + backend_config = BackendConfig.from_cli( + repo_path=str(repo_path), + output_dir=str(output_dir), + llm_base_url=config.base_url, + llm_api_key=api_key, + main_model=config.main_model, + cluster_model=config.cluster_model, + fallback_model=config.fallback_model, + provider=getattr(config, "provider", "openai-compatible"), + aws_region=getattr(config, "aws_region", "us-east-1"), + max_tokens=config.max_tokens, + agent_instructions=agent_instructions or None, + ) + + from codewiki.src.be.documentation_generator import DocumentationGenerator + doc_gen = DocumentationGenerator(backend_config) + + # Run generation + await doc_gen.run() + + # Collect results + generated_files = [] + for f in output_dir.iterdir(): + if f.suffix in (".md", ".json", ".html"): + generated_files.append(f.name) + + result = { + "status": "success", + "output_dir": str(output_dir), + "files_generated": sorted(generated_files), + "file_count": len(generated_files), + } + return [TextContent(type="text", text=json.dumps(result, indent=2))] + + +async def _handle_analyze_repo(arguments: dict[str, Any]) -> list[TextContent]: + """Handle analyze_repo tool call — lightweight dependency analysis only.""" + repo_path = Path(arguments["repo_path"]).expanduser().resolve() + + if not repo_path.exists(): + return [TextContent(type="text", text=f"Repository not found: {repo_path}")] + + manager = _load_config() + config = manager.get_config() + api_key = 
manager.get_api_key() + + from codewiki.src.config import Config as BackendConfig, set_cli_context + set_cli_context(True) + + # Create a minimal backend config (no LLM calls needed for analysis) + backend_config = BackendConfig.from_cli( + repo_path=str(repo_path), + output_dir=str(repo_path / ".codewiki_temp"), + llm_base_url=config.base_url or "http://localhost", + llm_api_key=api_key or "not-needed", + main_model=config.main_model or "unused", + cluster_model=config.cluster_model or "unused", + fallback_model=config.fallback_model or "unused", + ) + + from codewiki.src.be.dependency_analyzer import DependencyGraphBuilder + graph_builder = DependencyGraphBuilder(backend_config) + components, leaf_nodes = graph_builder.build_dependency_graph() + + # Aggregate statistics + languages = {} + files = set() + for comp in components.values(): + lang = getattr(comp, "language", "unknown") + languages[lang] = languages.get(lang, 0) + 1 + files.add(getattr(comp, "relative_path", "")) + + result = { + "status": "success", + "repo_path": str(repo_path), + "total_components": len(components), + "total_files": len(files), + "leaf_nodes": len(leaf_nodes), + "languages": languages, + "sample_components": sorted(list(components.keys()))[:20], + } + return [TextContent(type="text", text=json.dumps(result, indent=2))] + + +async def _handle_get_module_tree(arguments: dict[str, Any]) -> list[TextContent]: + """Handle get_module_tree tool call — returns existing module tree.""" + repo_path = Path(arguments["repo_path"]).expanduser().resolve() + output_dir = Path(arguments.get("output_dir", "docs")).expanduser().resolve() + + module_tree_path = output_dir / "module_tree.json" + if not module_tree_path.exists(): + return [TextContent( + type="text", + text=f"Module tree not found at {module_tree_path}. Run 'codewiki generate' first." + )] + + module_tree = json.loads(module_tree_path.read_text()) + + def _summarize_tree(tree, depth=0): + """Create a readable summary of the module tree.""" + lines = [] + for name, info in tree.items(): + indent = " " * depth + comp_count = len(info.get("components", [])) + children = info.get("children", {}) + child_count = len(children) if isinstance(children, dict) else 0 + lines.append(f"{indent}- {name} ({comp_count} components, {child_count} children)") + if isinstance(children, dict) and children: + lines.extend(_summarize_tree(children, depth + 1)) + return lines + + summary = "\n".join(_summarize_tree(module_tree)) + result = { + "status": "success", + "module_tree_path": str(module_tree_path), + "total_modules": len(module_tree), + "tree_summary": summary, + } + return [TextContent(type="text", text=json.dumps(result, indent=2))] + + +async def main(): + """Run the MCP server with stdio transport.""" + async with stdio_server() as (read_stream, write_stream): + await server.run(read_stream, write_stream, server.create_initialization_options()) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/codewiki/src/be/llm_services.py b/codewiki/src/be/llm_services.py index 08af907f..0b03a54c 100644 --- a/codewiki/src/be/llm_services.py +++ b/codewiki/src/be/llm_services.py @@ -3,6 +3,8 @@ Includes a compatibility layer for OpenAI-compatible API proxies that may return slightly non-standard responses (e.g. choices[].index = None). + +Supports multiple providers: openai-compatible, anthropic, bedrock. 
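+
+Illustrative sketch (assuming a fully populated Config instance named `config`):
+
+    config.provider = "bedrock"        # or "anthropic"; routed through litellm
+    config.aws_region = "us-east-1"    # only consulted by the bedrock provider
+    summary = call_llm("Summarize the auth module", config)
+
+The default "openai-compatible" provider keeps the existing OpenAI client path.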
""" import logging from openai.types import chat @@ -49,6 +51,22 @@ def _build_model_settings(config: Config, model_name: str) -> OpenAIModelSetting ) +def _get_litellm_model_name(model_name: str, provider: str) -> str: + """ + Get the litellm-compatible model name for a given provider. + + For Bedrock, prefixes the model name with 'bedrock/' if not already prefixed. + For Anthropic, prefixes with 'anthropic/' if not already prefixed. + """ + if provider == "bedrock": + if not model_name.startswith("bedrock/"): + return f"bedrock/{model_name}" + elif provider == "anthropic": + if not model_name.startswith("anthropic/"): + return f"anthropic/{model_name}" + return model_name + + class CompatibleOpenAIModel(OpenAIModel): """OpenAIModel subclass that patches non-standard API proxy responses. @@ -66,6 +84,28 @@ def _validate_completion(self, response: chat.ChatCompletion) -> chat.ChatComple return super()._validate_completion(response) +def _create_litellm_openai_client(config: Config) -> OpenAI: + """ + Create an OpenAI-compatible client backed by litellm's proxy. + + litellm translates OpenAI API calls to Bedrock, Anthropic, etc. + """ + import litellm + # Configure litellm for the provider + if config.provider == "bedrock": + import os + os.environ.setdefault("AWS_DEFAULT_REGION", config.aws_region) + os.environ.setdefault("AWS_REGION_NAME", config.aws_region) + + # litellm exposes an OpenAI-compatible Router we can use, + # but the simplest path is to use litellm.completion() directly. + # For pydantic-ai integration, we create a proxy client. + return OpenAI( + api_key=config.llm_api_key or "not-needed-for-bedrock", + base_url=config.llm_base_url or "https://api.openai.com/v1", + ) + + def create_main_model(config: Config) -> CompatibleOpenAIModel: """Create the main LLM model from configuration.""" return CompatibleOpenAIModel( @@ -114,6 +154,9 @@ def call_llm( """ Call LLM with the given prompt. + Supports openai-compatible, anthropic, and bedrock providers. + For bedrock/anthropic, uses litellm to translate the API call. + Args: prompt: The prompt to send config: Configuration containing LLM settings @@ -126,6 +169,12 @@ def call_llm( if model is None: model = config.main_model + provider = getattr(config, "provider", "openai-compatible") + + if provider in ("bedrock", "anthropic"): + return _call_llm_via_litellm(prompt, config, model, temperature) + + # Default: OpenAI-compatible client = create_openai_client(config) # Use the correct token parameter based on model/provider @@ -143,3 +192,36 @@ def call_llm( **token_kwargs ) return response.choices[0].message.content + + +def _call_llm_via_litellm( + prompt: str, + config: Config, + model: str, + temperature: float = 0.0 +) -> str: + """ + Call LLM via litellm for Bedrock/Anthropic providers. + + litellm handles the provider-specific API translation automatically. 
+ """ + import litellm + import os + + litellm_model = _get_litellm_model_name(model, config.provider) + + if config.provider == "bedrock": + os.environ.setdefault("AWS_DEFAULT_REGION", config.aws_region) + os.environ.setdefault("AWS_REGION_NAME", config.aws_region) + logger.debug("Calling Bedrock model %s in region %s", litellm_model, config.aws_region) + elif config.provider == "anthropic": + logger.debug("Calling Anthropic model %s via litellm", litellm_model) + + response = litellm.completion( + model=litellm_model, + messages=[{"role": "user", "content": prompt}], + temperature=temperature, + max_tokens=config.max_tokens, + api_key=config.llm_api_key if config.provider != "bedrock" else None, + ) + return response.choices[0].message.content diff --git a/codewiki/src/config.py b/codewiki/src/config.py index 420d1ea5..42757788 100644 --- a/codewiki/src/config.py +++ b/codewiki/src/config.py @@ -57,6 +57,9 @@ class Config: main_model: str cluster_model: str fallback_model: str = FALLBACK_MODEL_1 + # Provider configuration + provider: str = "openai-compatible" # openai-compatible, anthropic, bedrock + aws_region: str = "us-east-1" # Max token settings max_tokens: int = DEFAULT_MAX_TOKENS max_token_per_module: int = DEFAULT_MAX_TOKEN_PER_MODULE @@ -155,6 +158,8 @@ def from_cli( main_model: str, cluster_model: str, fallback_model: str = FALLBACK_MODEL_1, + provider: str = "openai-compatible", + aws_region: str = "us-east-1", max_tokens: int = DEFAULT_MAX_TOKENS, max_token_per_module: int = DEFAULT_MAX_TOKEN_PER_MODULE, max_token_per_leaf_module: int = DEFAULT_MAX_TOKEN_PER_LEAF_MODULE, @@ -163,7 +168,7 @@ def from_cli( ) -> 'Config': """ Create configuration for CLI context. - + Args: repo_path: Repository path output_dir: Output directory for generated docs @@ -172,18 +177,20 @@ def from_cli( main_model: Primary model cluster_model: Clustering model fallback_model: Fallback model + provider: LLM provider type (openai-compatible, anthropic, bedrock) + aws_region: AWS region for Bedrock provider max_tokens: Maximum tokens for LLM response max_token_per_module: Maximum tokens per module for clustering max_token_per_leaf_module: Maximum tokens per leaf module max_depth: Maximum depth for hierarchical decomposition agent_instructions: Custom agent instructions dict - + Returns: Config instance """ repo_name = os.path.basename(os.path.normpath(repo_path)) base_output_dir = os.path.join(output_dir, "temp") - + return cls( repo_path=repo_path, output_dir=base_output_dir, @@ -195,6 +202,8 @@ def from_cli( main_model=main_model, cluster_model=cluster_model, fallback_model=fallback_model, + provider=provider, + aws_region=aws_region, max_tokens=max_tokens, max_token_per_module=max_token_per_module, max_token_per_leaf_module=max_token_per_leaf_module, From f3fe1c669e27c2c6daaf839070165be421eaae86 Mon Sep 17 00:00:00 2001 From: anhnh2002 Date: Tue, 24 Mar 2026 17:03:24 +0700 Subject: [PATCH 7/9] re-format component id --- .../generate_sub_module_documentations.py | 15 +++++--- .../be/agent_tools/read_code_components.py | 2 +- .../analysis/call_graph_analyzer.py | 13 +++++-- .../src/be/dependency_analyzer/analyzers/c.py | 4 +-- .../be/dependency_analyzer/analyzers/cpp.py | 6 ++-- .../dependency_analyzer/analyzers/csharp.py | 4 +-- .../be/dependency_analyzer/analyzers/java.py | 6 ++-- .../analyzers/javascript.py | 36 +++++++++---------- .../dependency_analyzer/analyzers/kotlin.py | 6 ++-- .../be/dependency_analyzer/analyzers/php.py | 17 +++------ .../dependency_analyzer/analyzers/python.py | 26 
+++++++------- .../analyzers/typescript.py | 11 +++--- .../src/be/dependency_analyzer/ast_parser.py | 8 +++-- codewiki/src/be/documentation_generator.py | 3 ++ codewiki/src/be/prompt_template.py | 36 ++++++++++++++++--- 15 files changed, 117 insertions(+), 76 deletions(-) diff --git a/codewiki/src/be/agent_tools/generate_sub_module_documentations.py b/codewiki/src/be/agent_tools/generate_sub_module_documentations.py index a40b3f42..46580a20 100644 --- a/codewiki/src/be/agent_tools/generate_sub_module_documentations.py +++ b/codewiki/src/be/agent_tools/generate_sub_module_documentations.py @@ -1,4 +1,5 @@ from pydantic_ai import RunContext, Tool, Agent +from typing import Dict, List from codewiki.src.be.agent_tools.deps import CodeWikiDeps from codewiki.src.be.agent_tools.read_code_components import read_code_components_tool @@ -15,12 +16,14 @@ async def generate_sub_module_documentation( ctx: RunContext[CodeWikiDeps], - sub_module_specs: dict[str, list[str]] + sub_module_specs: Dict[str, List[str]] ) -> str: - """Generate detailed description of a given sub-module specs to the sub-agents + """Delegate documentation generation of sub-modules to sub-agents. Each sub-module will be documented separately. Args: - sub_module_specs: The specs of the sub-modules to generate documentation for. E.g. {"sub_module_1": ["core_component_1.1", "core_component_1.2"], "sub_module_2": ["core_component_2.1", "core_component_2.2"], ...} + sub_module_specs: A dictionary mapping sub-module names to their core component IDs. + Example: {"authentication": ["auth_handler.py::AuthHandler", "auth_middleware.py::verify_token"], "database": ["db_client.py::DBClient", "models.py::UserModel"]} + Each key is a descriptive sub-module name, and the value is a list of component IDs from the current module's core components that belong to that sub-module. """ deps = ctx.deps @@ -89,4 +92,8 @@ async def generate_sub_module_documentation( return f"Generate successfully. Documentations: {', '.join([key + '.md' for key in sub_module_specs.keys()])} are saved in the working directory." -generate_sub_module_documentation_tool = Tool(function=generate_sub_module_documentation, name="generate_sub_module_documentation", description="Generate detailed description of a given sub-module specs to the sub-agents", takes_ctx=True) \ No newline at end of file +generate_sub_module_documentation_tool = Tool( + function=generate_sub_module_documentation, + name="generate_sub_module_documentation", + takes_ctx=True +) \ No newline at end of file diff --git a/codewiki/src/be/agent_tools/read_code_components.py b/codewiki/src/be/agent_tools/read_code_components.py index 0125cbb2..93c13446 100644 --- a/codewiki/src/be/agent_tools/read_code_components.py +++ b/codewiki/src/be/agent_tools/read_code_components.py @@ -6,7 +6,7 @@ async def read_code_components(ctx: RunContext[CodeWikiDeps], component_ids: lis """Read the code of a given component id Args: - component_ids: The ids of the components to read, e.g. ["sweagent.types.AgentRunResult", "sweagent.types.AgentRunResult"] where sweagent.types part is the path to the component and AgentRunResult is the name of the component + component_ids: The ids of the components to read, e.g. 
["sweagent/types.py::AgentRunResult", "auth/middleware.py::verify_token"] where the part before :: is the file path and the part after :: is the component name """ results = [] diff --git a/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py b/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py index 272ca0b6..8df9e02e 100644 --- a/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py +++ b/codewiki/src/be/dependency_analyzer/analysis/call_graph_analyzer.py @@ -412,7 +412,11 @@ def _resolve_call_relationships(self): func_lookup[func_info.name] = func_id if func_info.component_id: func_lookup[func_info.component_id] = func_id - method_name = func_info.component_id.split(".")[-1] + # Extract short name: handle both new (path::Name) and legacy (path.Name) formats + if "::" in func_info.component_id: + method_name = func_info.component_id.split("::")[-1] + else: + method_name = func_info.component_id.split(".")[-1] if method_name not in func_lookup: func_lookup[method_name] = func_id @@ -424,13 +428,16 @@ def _resolve_call_relationships(self): relationship.callee = func_lookup[callee_name] relationship.is_resolved = True resolved_count += 1 - elif "." in callee_name: + elif "::" in callee_name or "." in callee_name: if callee_name in func_lookup: relationship.callee = func_lookup[callee_name] relationship.is_resolved = True resolved_count += 1 else: - method_name = callee_name.split(".")[-1] + if "::" in callee_name: + method_name = callee_name.split("::")[-1] + else: + method_name = callee_name.split(".")[-1] if method_name in func_lookup: relationship.callee = func_lookup[method_name] relationship.is_resolved = True diff --git a/codewiki/src/be/dependency_analyzer/analyzers/c.py b/codewiki/src/be/dependency_analyzer/analyzers/c.py index 9332a6f8..1a491961 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/c.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/c.py @@ -44,8 +44,8 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str) -> str: - module_path = self._get_module_path() - return f"{module_path}.{name}" if module_path else name + rel_path = self._get_relative_path() + return f"{rel_path}::{name}" def _analyze(self): language_capsule = tree_sitter_c.language() diff --git a/codewiki/src/be/dependency_analyzer/analyzers/cpp.py b/codewiki/src/be/dependency_analyzer/analyzers/cpp.py index dd89d1b3..bb98a9e1 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/cpp.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/cpp.py @@ -44,10 +44,10 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str, parent_class: str = None) -> str: - module_path = self._get_module_path() + rel_path = self._get_relative_path() if parent_class: - return f"{module_path}.{parent_class}.{name}" if module_path else f"{parent_class}.{name}" - return f"{module_path}.{name}" if module_path else name + return f"{rel_path}::{parent_class}.{name}" + return f"{rel_path}::{name}" def _analyze(self): language_capsule = tree_sitter_cpp.language() diff --git a/codewiki/src/be/dependency_analyzer/analyzers/csharp.py b/codewiki/src/be/dependency_analyzer/analyzers/csharp.py index 50500aa4..636e2063 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/csharp.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/csharp.py @@ -44,8 +44,8 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str) -> str: - 
module_path = self._get_module_path() - return f"{module_path}.{name}" if module_path else name + rel_path = self._get_relative_path() + return f"{rel_path}::{name}" def _analyze(self): language_capsule = tree_sitter_c_sharp.language() diff --git a/codewiki/src/be/dependency_analyzer/analyzers/java.py b/codewiki/src/be/dependency_analyzer/analyzers/java.py index 26f586a1..71065645 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/java.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/java.py @@ -45,11 +45,11 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str, parent_class: str = None) -> str: - module_path = self._get_module_path() + rel_path = self._get_relative_path() if parent_class: - return f"{module_path}.{parent_class}.{name}" + return f"{rel_path}::{parent_class}.{name}" else: - return f"{module_path}.{name}" + return f"{rel_path}::{name}" def _analyze(self): language_capsule = tree_sitter_java.language() diff --git a/codewiki/src/be/dependency_analyzer/analyzers/javascript.py b/codewiki/src/be/dependency_analyzer/analyzers/javascript.py index 7b94e167..a1312695 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/javascript.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/javascript.py @@ -94,14 +94,14 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str, class_name: str = None, is_method: bool = False) -> str: - module_path = self._get_module_path() - + relative_path = self._get_relative_path() + if is_method and class_name: - return f"{module_path}.{class_name}.{name}" - elif class_name and not is_method: - return f"{module_path}.{name}" - else: - return f"{module_path}.{name}" + return f"{relative_path}::{class_name}.{name}" + elif class_name and not is_method: + return f"{relative_path}::{name}" + else: + return f"{relative_path}::{name}" def _find_containing_class(self, node) -> Optional[str]: parent = node.parent @@ -167,7 +167,7 @@ def _extract_methods_from_class(self, class_node, class_name: str) -> None: if child.type == "method_definition": method_name = self._get_method_name(child) if method_name: - method_key = f"{self._get_module_path()}.{class_name}.{method_name}" + method_key = f"{self._get_relative_path()}::{class_name}.{method_name}" method_node = self._create_method_node(child, method_name, class_name) if method_node: self.top_level_nodes[method_key] = method_node @@ -175,7 +175,7 @@ def _extract_methods_from_class(self, class_node, class_name: str) -> None: # Handle arrow function properties field_name = self._get_field_name(child) if field_name and self._is_arrow_function_field(child): - method_key = f"{self._get_module_path()}.{class_name}.{field_name}" + method_key = f"{self._get_relative_path()}::{class_name}.{field_name}" method_node = self._create_method_node(child, field_name, class_name) if method_node: self.top_level_nodes[method_key] = method_node @@ -435,7 +435,7 @@ def _traverse_for_calls(self, node, current_top_level) -> None: if child.type in ["identifier", "type_identifier"]: base_class = self._get_node_text(child) caller_id = self._get_component_id(current_top_level) - callee_id = f"{self._get_module_path()}.{base_class}" + callee_id = f"{self._get_relative_path()}::{base_class}" inheritance_rel = CallRelationship( caller=caller_id, callee=callee_id, @@ -476,8 +476,8 @@ def _traverse_for_calls(self, node, current_top_level) -> None: callee_name = self._extract_callee_name(node) if callee_name: call_info = 
CallRelationship( - caller=f"{self._get_module_path()}.{current_top_level}", - callee=f"{self._get_module_path()}.{callee_name}", + caller=f"{self._get_relative_path()}::{current_top_level}", + callee=f"{self._get_relative_path()}::{callee_name}", call_line=node.start_point[0] + 1, is_resolved=False ) @@ -498,8 +498,8 @@ def _extract_call_from_node(self, node, caller_name: str) -> Optional[CallRelati call_text = self._get_node_text(node) is_method_call = "this." in call_text or "super." in call_text - caller_id = f"{self._get_module_path()}.{caller_name}" - + caller_id = f"{self._get_relative_path()}::{caller_name}" + if is_method_call: current_class = None for node_key, node_obj in self.top_level_nodes.items(): @@ -508,11 +508,11 @@ def _extract_call_from_node(self, node, caller_name: str) -> Optional[CallRelati break if current_class: - method_key = f"{self._get_module_path()}.{current_class}.{callee_name}" + method_key = f"{self._get_relative_path()}::{current_class}.{callee_name}" if method_key in self.top_level_nodes: return None - callee_id = f"{self._get_module_path()}.{callee_name}" + callee_id = f"{self._get_relative_path()}::{callee_name}" if callee_name in self.top_level_nodes: return CallRelationship( caller=caller_id, @@ -570,8 +570,8 @@ def _parse_jsdoc_types(self, comment_text: str, caller_name: str, line_number: i for base_type in base_types: if base_type and not self._is_builtin_type_js(base_type): - caller_id = f"{self._get_module_path()}.{caller_name}" - callee_id = f"{self._get_module_path()}.{base_type}" + caller_id = f"{self._get_relative_path()}::{caller_name}" + callee_id = f"{self._get_relative_path()}::{base_type}" type_rel = CallRelationship( caller=caller_id, diff --git a/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py b/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py index d56f220c..7ef5be9a 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/kotlin.py @@ -45,11 +45,11 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str, parent_class: Optional[str] = None) -> str: - module_path = self._get_module_path() + rel_path = self._get_relative_path() if parent_class: - return f"{module_path}.{parent_class}.{name}" + return f"{rel_path}::{parent_class}.{name}" else: - return f"{module_path}.{name}" + return f"{rel_path}::{name}" def _analyze(self): try: diff --git a/codewiki/src/be/dependency_analyzer/analyzers/php.py b/codewiki/src/be/dependency_analyzer/analyzers/php.py index 2029ec82..8a5696fd 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/php.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/php.py @@ -148,17 +148,10 @@ def _get_relative_path(self) -> str: def _get_component_id(self, name: str, parent_class: str = None) -> str: """Generate component ID for a node.""" - # Use namespace if available - if self.namespace_resolver.current_namespace: - ns_prefix = self.namespace_resolver.current_namespace.replace("\\", ".") - if parent_class: - return f"{ns_prefix}.{parent_class}.{name}" - return f"{ns_prefix}.{name}" - - module_path = self._get_module_path() + rel_path = self._get_relative_path() if parent_class: - return f"{module_path}.{parent_class}.{name}" - return f"{module_path}.{name}" + return f"{rel_path}::{parent_class}.{name}" + return f"{rel_path}::{name}" def _analyze(self): """Parse and analyze the PHP file.""" @@ -442,7 +435,7 @@ def _add_use_relationships(self, node): if name_node: fqn = 
name_node.text.decode().replace("\\", ".") # Add relationship from file to imported class - file_id = self._get_module_path() + file_id = self._get_relative_path() self.call_relationships.append(CallRelationship( caller=file_id, callee=fqn, @@ -458,7 +451,7 @@ def _add_use_relationships(self, node): name_node = self._find_child_by_type(group_child, "namespace_name") if name_node: fqn = f"{prefix}\\{name_node.text.decode()}" if prefix else name_node.text.decode() - file_id = self._get_module_path() + file_id = self._get_relative_path() self.call_relationships.append(CallRelationship( caller=file_id, callee=fqn.replace("\\", "."), diff --git a/codewiki/src/be/dependency_analyzer/analyzers/python.py b/codewiki/src/be/dependency_analyzer/analyzers/python.py index deda7935..e865729e 100644 --- a/codewiki/src/be/dependency_analyzer/analyzers/python.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/python.py @@ -53,12 +53,12 @@ def _get_module_path(self) -> str: return str(self.file_path).replace('/', '.').replace('\\', '.') def _get_component_id(self, name: str) -> str: - """Generate dot-separated component ID.""" - module_path = self._get_module_path() + """Generate component ID in relative_path::name format.""" + rel_path = self._get_relative_path() if self.current_class_name: - return f"{module_path}.{self.current_class_name}.{name}" + return f"{rel_path}::{self.current_class_name}.{name}" else: - return f"{module_path}.{name}" + return f"{rel_path}::{name}" def generic_visit(self, node): """Override generic_visit to continue AST traversal.""" @@ -70,9 +70,9 @@ def visit_ClassDef(self, node: ast.ClassDef): base_classes = [self._extract_base_class_name(base) for base in node.bases] base_classes = [name for name in base_classes if name is not None] - component_id = f"{self._get_module_path()}.{node.name}" + component_id = f"{self._get_relative_path()}::{node.name}" relative_path = self._get_relative_path() - + class_node = Node( id=component_id, name=node.name, @@ -98,7 +98,7 @@ def visit_ClassDef(self, node: ast.ClassDef): if base_name in self.top_level_nodes: self.call_relationships.append(CallRelationship( caller=component_id, - callee=f"{self._get_module_path()}.{base_name}", + callee=f"{self._get_relative_path()}::{base_name}", call_line=node.lineno, is_resolved=True )) @@ -126,9 +126,9 @@ def _process_function_node(self, node: ast.FunctionDef | ast.AsyncFunctionDef): """Process function definition - only add to nodes if it's top-level.""" if not self.current_class_name: - component_id = f"{self._get_module_path()}.{node.name}" + component_id = f"{self._get_relative_path()}::{node.name}" relative_path = self._get_relative_path() - + func_node = Node( id=component_id, name=node.name, @@ -175,12 +175,12 @@ def visit_Call(self, node: ast.Call): call_name = self._get_call_name(node.func) if call_name: if self.current_class_name: - caller_id = f"{self._get_module_path()}.{self.current_class_name}" + caller_id = f"{self._get_relative_path()}::{self.current_class_name}" else: - caller_id = f"{self._get_module_path()}.{self.current_function_name}" - + caller_id = f"{self._get_relative_path()}::{self.current_function_name}" + if call_name in self.top_level_nodes: - callee_id = f"{self._get_module_path()}.{call_name}" + callee_id = f"{self._get_relative_path()}::{call_name}" else: callee_id = call_name diff --git a/codewiki/src/be/dependency_analyzer/analyzers/typescript.py b/codewiki/src/be/dependency_analyzer/analyzers/typescript.py index 68abc86d..0119bc9c 100644 --- 
a/codewiki/src/be/dependency_analyzer/analyzers/typescript.py +++ b/codewiki/src/be/dependency_analyzer/analyzers/typescript.py @@ -608,8 +608,8 @@ def _extract_parameter_dependencies(self, formal_params, caller_name: str) -> No if type_id: dependency_name = self._get_node_text(type_id) if dependency_name and dependency_name != caller_name: - caller_id = f"{self._get_module_path()}.{caller_name}" - callee_id = f"{self._get_module_path()}.{dependency_name}" + caller_id = f"{self._get_relative_path()}::{caller_name}" + callee_id = f"{self._get_relative_path()}::{dependency_name}" relationship = CallRelationship( caller=caller_id, @@ -648,8 +648,7 @@ def _get_relative_path(self) -> str: return str(self.file_path) def _get_component_id(self, name: str) -> str: - module_path = self._get_module_path() - return f"{module_path}.{name}" + return f"{self._get_relative_path()}::{name}" def _extract_inheritance(self, node) -> List[str]: """Extract inheritance/implementation relationships.""" @@ -920,8 +919,8 @@ def _resolve_to_top_level(self, entity_name: str, all_entities: dict) -> Optiona return entity_name if entity_name in self.top_level_nodes else None def _add_relationship(self, caller_name: str, callee_name: str, call_line: int) -> None: - caller_id = f"{self._get_module_path()}.{caller_name}" - callee_id = f"{self._get_module_path()}.{callee_name}" + caller_id = f"{self._get_relative_path()}::{caller_name}" + callee_id = f"{self._get_relative_path()}::{callee_name}" relationship = CallRelationship( caller=caller_id, diff --git a/codewiki/src/be/dependency_analyzer/ast_parser.py b/codewiki/src/be/dependency_analyzer/ast_parser.py index 81ac0bdc..4c50ea59 100644 --- a/codewiki/src/be/dependency_analyzer/ast_parser.py +++ b/codewiki/src/be/dependency_analyzer/ast_parser.py @@ -97,8 +97,12 @@ def _build_components_from_analysis(self, call_graph_result: Dict): if legacy_id and legacy_id != component_id: component_id_mapping[legacy_id] = component_id - if "." in component_id: - module_parts = component_id.split(".")[:-1] + if "::" in component_id: + file_path_part = component_id.split("::")[0] + if file_path_part: + self.modules.add(file_path_part) + elif "." 
in component_id: + module_parts = component_id.split(".")[:-1] module_path = ".".join(module_parts) if module_path: self.modules.add(module_path) diff --git a/codewiki/src/be/documentation_generator.py b/codewiki/src/be/documentation_generator.py index 261be616..234712cd 100644 --- a/codewiki/src/be/documentation_generator.py +++ b/codewiki/src/be/documentation_generator.py @@ -143,6 +143,9 @@ async def generate_module_documentation(self, components: Dict[str, Any], leaf_n if len(module_tree) > 0: for module_path, module_name in processing_order: try: + # Reload module tree to get latest hierarchical structure from sub-agent modifications + module_tree = file_manager.load_json(module_tree_path) + # Get the module info from the tree module_info = module_tree for path_part in module_path: diff --git a/codewiki/src/be/prompt_template.py b/codewiki/src/be/prompt_template.py index f6da5f8b..f374315f 100644 --- a/codewiki/src/be/prompt_template.py +++ b/codewiki/src/be/prompt_template.py @@ -265,12 +265,26 @@ def _format_module_tree(module_tree: dict[str, any], indent: int = 0): lines.append(f"{' ' * indent}{key} (current module)") else: lines.append(f"{' ' * indent}{key}") - - lines.append(f"{' ' * (indent + 1)} Core components: {', '.join(value['components'])}") + + # Group components by file + from collections import defaultdict + by_file = defaultdict(list) + for c in value['components']: + if "::" in c: + fpath, name = c.split("::", 1) + by_file[fpath].append(name) + else: + by_file[""].append(c) + for fpath, names in by_file.items(): + if fpath: + lines.append(f"{' ' * (indent + 1)} {fpath}: {', '.join(names)}") + else: + lines.append(f"{' ' * (indent + 1)} {', '.join(names)}") + if isinstance(value["children"], dict) and len(value["children"]) > 0: lines.append(f"{' ' * (indent + 1)} Children:") _format_module_tree(value["children"], indent + 2) - + _format_module_tree(module_tree, 0) formatted_module_tree = "\n".join(lines) @@ -326,7 +340,21 @@ def _format_module_tree(module_tree: dict[str, any], indent: int = 0): else: lines.append(f"{' ' * indent}{key}") - lines.append(f"{' ' * (indent + 1)} Core components: {', '.join(value['components'])}") + # Group components by file + from collections import defaultdict + by_file = defaultdict(list) + for c in value['components']: + if "::" in c: + fpath, name = c.split("::", 1) + by_file[fpath].append(name) + else: + by_file[""].append(c) + for fpath, names in by_file.items(): + if fpath: + lines.append(f"{' ' * (indent + 1)} {fpath}: {', '.join(names)}") + else: + lines.append(f"{' ' * (indent + 1)} {', '.join(names)}") + if ("children" in value) and isinstance(value["children"], dict) and len(value["children"]) > 0: lines.append(f"{' ' * (indent + 1)} Children:") _format_module_tree(value["children"], indent + 2) From 81827f01ae8645adc4639e13a5d52881543e8a93 Mon Sep 17 00:00:00 2001 From: Nghi Bui Date: Fri, 3 Apr 2026 23:05:08 -0700 Subject: [PATCH 8/9] Add Azure OpenAI support (#49) Add azure-openai as a new provider option, using the AzureOpenAI client from the openai package. Users can configure via --provider azure-openai with --azure-deployment and --api-version options. 
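For example, a user might point CodeWiki at an Azure deployment like this (illustrative placeholder values for the key, resource URL, and deployment name):

    codewiki config set \
      --provider azure-openai \
      --api-key YOUR_AZURE_KEY \
      --base-url https://YOUR_RESOURCE.openai.azure.com \
      --azure-deployment YOUR_DEPLOYMENT \
      --api-version 2024-12-01-preview \
      --main-model gpt-4o \
      --cluster-model gpt-4o

When --api-version is omitted, the provider falls back to the 2024-12-01-preview default.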
Co-Authored-By: Claude Opus 4.6 (1M context) --- codewiki/cli/commands/config.py | 54 +++++++++++++++++++++++++++++---- codewiki/cli/config_manager.py | 12 ++++++-- codewiki/cli/models/config.py | 12 +++++++- codewiki/src/be/llm_services.py | 37 +++++++++++++++++++++- codewiki/src/config.py | 12 ++++++-- 5 files changed, 115 insertions(+), 12 deletions(-) diff --git a/codewiki/cli/commands/config.py b/codewiki/cli/commands/config.py index 5ef6d99c..63df5609 100644 --- a/codewiki/cli/commands/config.py +++ b/codewiki/cli/commands/config.py @@ -85,7 +85,7 @@ def config_group(): ) @click.option( "--provider", - type=click.Choice(['openai-compatible', 'anthropic', 'bedrock'], case_sensitive=False), + type=click.Choice(['openai-compatible', 'anthropic', 'bedrock', 'azure-openai'], case_sensitive=False), help="LLM provider type (default: openai-compatible)" ) @click.option( @@ -93,6 +93,16 @@ def config_group(): type=str, help="AWS region for Bedrock provider (default: us-east-1)" ) +@click.option( + "--api-version", + type=str, + help="Azure OpenAI API version (default: 2024-12-01-preview)" +) +@click.option( + "--azure-deployment", + type=str, + help="Azure OpenAI deployment name" +) def config_set( api_key: Optional[str], base_url: Optional[str], @@ -104,7 +114,9 @@ def config_set( max_token_per_leaf_module: Optional[int], max_depth: Optional[int], provider: Optional[str] = None, - aws_region: Optional[str] = None + aws_region: Optional[str] = None, + api_version: Optional[str] = None, + azure_deployment: Optional[str] = None ): """ Set configuration values for CodeWiki. @@ -139,7 +151,7 @@ def config_set( """ try: # Check if at least one option is provided - if not any([api_key, base_url, main_model, cluster_model, fallback_model, max_tokens, max_token_per_module, max_token_per_leaf_module, max_depth, provider, aws_region]): + if not any([api_key, base_url, main_model, cluster_model, fallback_model, max_tokens, max_token_per_module, max_token_per_leaf_module, max_depth, provider, aws_region, api_version, azure_deployment]): click.echo("No options provided. 
Use --help for usage information.") sys.exit(EXIT_CONFIG_ERROR) @@ -187,6 +199,12 @@ def config_set( if aws_region is not None: validated_data['aws_region'] = aws_region + if api_version is not None: + validated_data['api_version'] = api_version + + if azure_deployment is not None: + validated_data['azure_deployment'] = azure_deployment + # Create config manager and save manager = ConfigManager() manager.load() # Load existing config if present @@ -202,7 +220,9 @@ def config_set( max_token_per_leaf_module=validated_data.get('max_token_per_leaf_module'), max_depth=validated_data.get('max_depth'), provider=validated_data.get('provider'), - aws_region=validated_data.get('aws_region') + aws_region=validated_data.get('aws_region'), + api_version=validated_data.get('api_version'), + azure_deployment=validated_data.get('azure_deployment') ) # Display success messages @@ -256,7 +276,13 @@ def config_set( if aws_region: click.secho(f"✓ AWS Region: {aws_region}", fg="green") - + + if api_version: + click.secho(f"✓ API Version: {api_version}", fg="green") + + if azure_deployment: + click.secho(f"✓ Azure Deployment: {azure_deployment}", fg="green") + click.echo("\n" + click.style("Configuration updated successfully.", fg="green", bold=True)) except ConfigurationError as e: @@ -342,6 +368,12 @@ def config_show(output_json: bool): click.echo(f" Main Model: {config.main_model or 'Not set'}") click.echo(f" Cluster Model: {config.cluster_model or 'Not set'}") click.echo(f" Fallback Model: {config.fallback_model or 'Not set'}") + click.echo(f" Provider: {config.provider}") + if config.provider == "bedrock": + click.echo(f" AWS Region: {config.aws_region}") + elif config.provider == "azure-openai": + click.echo(f" API Version: {config.api_version}") + click.echo(f" Azure Deployment: {config.azure_deployment or 'Not set'}") else: click.secho(" Not configured", fg="yellow") @@ -523,7 +555,17 @@ def config_validate(quick: bool, verbose: bool): try: base_url_lower = (config.base_url or "").lower() - if "api.anthropic.com" in base_url_lower: + provider = getattr(config, 'provider', 'openai-compatible') + if provider == "azure-openai" or ".openai.azure.com" in base_url_lower: + # Use Azure OpenAI SDK + from openai import AzureOpenAI + client = AzureOpenAI( + api_key=api_key, + api_version=config.api_version, + azure_endpoint=config.base_url, + ) + client.models.list() + elif "api.anthropic.com" in base_url_lower: # Use Anthropic SDK for native Anthropic endpoints import anthropic client = anthropic.Anthropic(api_key=api_key) diff --git a/codewiki/cli/config_manager.py b/codewiki/cli/config_manager.py index a652e405..a87df025 100644 --- a/codewiki/cli/config_manager.py +++ b/codewiki/cli/config_manager.py @@ -133,7 +133,9 @@ def save( max_token_per_leaf_module: Optional[int] = None, max_depth: Optional[int] = None, provider: Optional[str] = None, - aws_region: Optional[str] = None + aws_region: Optional[str] = None, + api_version: Optional[str] = None, + azure_deployment: Optional[str] = None ): """ Save configuration to file and keyring. 
@@ -149,8 +151,10 @@ def save( max_token_per_module: Maximum tokens per module for clustering max_token_per_leaf_module: Maximum tokens per leaf module max_depth: Maximum depth for hierarchical decomposition - provider: LLM provider type (openai-compatible, anthropic, bedrock) + provider: LLM provider type (openai-compatible, anthropic, bedrock, azure-openai) aws_region: AWS region for Bedrock provider + api_version: Azure OpenAI API version + azure_deployment: Azure OpenAI deployment name """ # Ensure config directory exists try: @@ -196,6 +200,10 @@ def save( self._config.provider = provider if aws_region is not None: self._config.aws_region = aws_region + if api_version is not None: + self._config.api_version = api_version + if azure_deployment is not None: + self._config.azure_deployment = azure_deployment # Validate configuration (only if base fields are set) if self._config.base_url and self._config.main_model and self._config.cluster_model: diff --git a/codewiki/cli/models/config.py b/codewiki/cli/models/config.py index 8bfe9152..3f9e0499 100644 --- a/codewiki/cli/models/config.py +++ b/codewiki/cli/models/config.py @@ -113,8 +113,10 @@ class Configuration: cluster_model: Model for module clustering fallback_model: Fallback model for documentation generation default_output: Default output directory - provider: LLM provider type (openai-compatible, anthropic, bedrock) + provider: LLM provider type (openai-compatible, anthropic, bedrock, azure-openai) aws_region: AWS region for Bedrock provider + api_version: Azure OpenAI API version + azure_deployment: Azure OpenAI deployment name max_tokens: Maximum tokens for LLM response (default: 32768) max_token_per_module: Maximum tokens per module for clustering (default: 36369) max_token_per_leaf_module: Maximum tokens per leaf module (default: 16000) @@ -128,6 +130,8 @@ class Configuration: default_output: str = "docs" provider: str = "openai-compatible" aws_region: str = "us-east-1" + api_version: str = "2024-12-01-preview" + azure_deployment: str = "" max_tokens: int = 32768 max_token_per_module: int = 36369 max_token_per_leaf_module: int = 16000 @@ -155,6 +159,8 @@ def to_dict(self) -> dict: 'default_output': self.default_output, 'provider': self.provider, 'aws_region': self.aws_region, + 'api_version': self.api_version, + 'azure_deployment': self.azure_deployment, 'max_tokens': self.max_tokens, 'max_token_per_module': self.max_token_per_module, 'max_token_per_leaf_module': self.max_token_per_leaf_module, @@ -187,6 +193,8 @@ def from_dict(cls, data: dict) -> 'Configuration': default_output=data.get('default_output', 'docs'), provider=data.get('provider', 'openai-compatible'), aws_region=data.get('aws_region', 'us-east-1'), + api_version=data.get('api_version', '2024-12-01-preview'), + azure_deployment=data.get('azure_deployment', ''), max_tokens=data.get('max_tokens', 32768), max_token_per_module=data.get('max_token_per_module', 36369), max_token_per_leaf_module=data.get('max_token_per_leaf_module', 16000), @@ -243,6 +251,8 @@ def to_backend_config(self, repo_path: str, output_dir: str, api_key: str, runti fallback_model=self.fallback_model, provider=self.provider, aws_region=self.aws_region, + api_version=self.api_version, + azure_deployment=self.azure_deployment, max_tokens=self.max_tokens, max_token_per_module=self.max_token_per_module, max_token_per_leaf_module=self.max_token_per_leaf_module, diff --git a/codewiki/src/be/llm_services.py b/codewiki/src/be/llm_services.py index 0b03a54c..db3437a7 100644 --- 
+++ b/codewiki/src/be/llm_services.py
@@ -4,7 +4,7 @@
 Includes a compatibility layer for OpenAI-compatible API proxies that may
 return slightly non-standard responses (e.g. choices[].index = None).
 
-Supports multiple providers: openai-compatible, anthropic, bedrock.
+Supports multiple providers: openai-compatible, anthropic, bedrock, azure-openai.
 """
 import logging
 from openai.types import chat
@@ -174,6 +174,9 @@ def call_llm(
     if provider in ("bedrock", "anthropic"):
         return _call_llm_via_litellm(prompt, config, model, temperature)
 
+    if provider == "azure-openai":
+        return _call_llm_via_azure(prompt, config, model, temperature)
+
     # Default: OpenAI-compatible
     client = create_openai_client(config)
 
@@ -225,3 +228,35 @@ def _call_llm_via_litellm(
         api_key=config.llm_api_key if config.provider != "bedrock" else None,
     )
     return response.choices[0].message.content
+
+
+def _call_llm_via_azure(
+    prompt: str,
+    config: Config,
+    model: str,
+    temperature: float = 0.0
+) -> str:
+    """
+    Call LLM via Azure OpenAI.
+
+    Uses the AzureOpenAI client from the openai package with
+    azure_endpoint, api_version, and deployment name.
+    """
+    from openai import AzureOpenAI
+
+    client = AzureOpenAI(
+        api_key=config.llm_api_key,
+        api_version=config.api_version,
+        azure_endpoint=config.llm_base_url,
+    )
+
+    deployment = config.azure_deployment or model
+    logger.debug("Calling Azure OpenAI deployment %s (api_version=%s)", deployment, config.api_version)
+
+    response = client.chat.completions.create(
+        model=deployment,
+        messages=[{"role": "user", "content": prompt}],
+        temperature=temperature,
+        max_tokens=config.max_tokens,
+    )
+    return response.choices[0].message.content
diff --git a/codewiki/src/config.py b/codewiki/src/config.py
index 42757788..120ac2bd 100644
--- a/codewiki/src/config.py
+++ b/codewiki/src/config.py
@@ -58,8 +58,10 @@ class Config:
     cluster_model: str
     fallback_model: str = FALLBACK_MODEL_1
     # Provider configuration
-    provider: str = "openai-compatible"  # openai-compatible, anthropic, bedrock
+    provider: str = "openai-compatible"  # openai-compatible, anthropic, bedrock, azure-openai
     aws_region: str = "us-east-1"
+    api_version: str = "2024-12-01-preview"  # Azure OpenAI API version
+    azure_deployment: str = ""  # Azure OpenAI deployment name
     # Max token settings
     max_tokens: int = DEFAULT_MAX_TOKENS
     max_token_per_module: int = DEFAULT_MAX_TOKEN_PER_MODULE
@@ -160,6 +162,8 @@ def from_cli(
         fallback_model: str = FALLBACK_MODEL_1,
         provider: str = "openai-compatible",
         aws_region: str = "us-east-1",
+        api_version: str = "2024-12-01-preview",
+        azure_deployment: str = "",
         max_tokens: int = DEFAULT_MAX_TOKENS,
         max_token_per_module: int = DEFAULT_MAX_TOKEN_PER_MODULE,
         max_token_per_leaf_module: int = DEFAULT_MAX_TOKEN_PER_LEAF_MODULE,
@@ -177,8 +181,10 @@ def from_cli(
             main_model: Primary model
             cluster_model: Clustering model
             fallback_model: Fallback model
-            provider: LLM provider type (openai-compatible, anthropic, bedrock)
+            provider: LLM provider type (openai-compatible, anthropic, bedrock, azure-openai)
             aws_region: AWS region for Bedrock provider
+            api_version: Azure OpenAI API version
+            azure_deployment: Azure OpenAI deployment name
             max_tokens: Maximum tokens for LLM response
             max_token_per_module: Maximum tokens per module for clustering
             max_token_per_leaf_module: Maximum tokens per leaf module
@@ -204,6 +210,8 @@ def from_cli(
             fallback_model=fallback_model,
             provider=provider,
             aws_region=aws_region,
+            api_version=api_version,
+            azure_deployment=azure_deployment,
             max_tokens=max_tokens,
             max_token_per_module=max_token_per_module,
             max_token_per_leaf_module=max_token_per_leaf_module,

From 738e0c4d0e7e3674915372e67e7dc0779d0bf3e8 Mon Sep 17 00:00:00 2001
From: Nghi Bui
Date: Fri, 3 Apr 2026 23:09:11 -0700
Subject: [PATCH 9/9] Update README with Azure OpenAI, Bedrock, incremental updates, and MCP server

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 README.md | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 951812bb..60b82e65 100644
--- a/README.md
+++ b/README.md
@@ -42,15 +42,32 @@ codewiki --version
 
 ### 2. Configure Your Environment
 
-CodeWiki supports multiple models via an OpenAI-compatible SDK layer.
+CodeWiki supports multiple LLM providers: **OpenAI-compatible**, **Anthropic**, **AWS Bedrock**, and **Azure OpenAI**.
 
 ```bash
+# Anthropic
 codewiki config set \
   --api-key YOUR_API_KEY \
   --base-url https://api.anthropic.com \
   --main-model claude-sonnet-4 \
   --cluster-model claude-sonnet-4 \
   --fallback-model glm-4p5
+
+# Azure OpenAI
+codewiki config set \
+  --provider azure-openai \
+  --api-key YOUR_AZURE_KEY \
+  --base-url https://YOUR_RESOURCE.openai.azure.com \
+  --azure-deployment YOUR_DEPLOYMENT \
+  --main-model gpt-4o \
+  --cluster-model gpt-4o
+
+# AWS Bedrock
+codewiki config set \
+  --provider bedrock \
+  --aws-region us-east-1 \
+  --main-model anthropic.claude-sonnet-4-v2:0 \
+  --cluster-model anthropic.claude-sonnet-4-v2:0
 ```
 
 ### 3. Generate Documentation
@@ -138,6 +155,9 @@ codewiki generate --verbose
 
 # Full-featured generation
 codewiki generate --create-branch --github-pages --verbose
+
+# Incremental update (only regenerate changed modules since last run)
+codewiki generate --update
 ```
 
 ### Customization Options
@@ -235,7 +255,7 @@ codewiki generate --max-tokens 16384 --max-token-per-module 40000 --max-depth 3
 
 ### Configuration Storage
 
-- **API keys**: Securely stored in system keychain (macOS Keychain, Windows Credential Manager, Linux Secret Service)
+- **API keys**: Securely stored in system keychain (macOS Keychain, Windows Credential Manager, Linux Secret Service). Falls back to `~/.codewiki/credentials.json` in headless/container environments. Set `CODEWIKI_NO_KEYRING=1` to force file-based storage.
 - **Settings & Agent Instructions**: `~/.codewiki/config.json`
 
 ---
@@ -331,7 +351,7 @@ CodeWiki employs a three-stage process for comprehensive documentation generatio
 
 - **Python 3.12+**
 - **Node.js** (for Mermaid diagram validation)
-- **LLM API access** (Anthropic Claude, OpenAI, etc.)
+- **LLM API access** (Anthropic Claude, OpenAI, Azure OpenAI, AWS Bedrock)
 - **Git** (for branch creation features)
 
 ---
@@ -339,6 +359,7 @@ CodeWiki employs a three-stage process for comprehensive documentation generatio
 ## Additional Resources
 
 ### Documentation & Guides
+- **[MCP Server](codewiki/mcp/)** - Model Context Protocol server for IDE integrations
 - **[Docker Deployment](docker/DOCKER_README.md)** - Containerized deployment instructions
 - **[Development Guide](DEVELOPMENT.md)** - Project structure, architecture, and contributing guidelines
 - **[CodeWikiBench](https://github.com/FSoft-AI4Code/CodeWikiBench)** - Repository-level documentation benchmark
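To make the README's Azure OpenAI settings concrete: they feed the `_call_llm_via_azure` helper added earlier in this series, which resolves the deployment name before issuing a chat completion. The sketch below shows roughly what that request looks like, assuming the `openai` v1+ SDK; the key, resource, and deployment names are placeholders.

```python
# Rough illustration of the request CodeWiki sends for provider "azure-openai".
# Key, resource, and deployment values are placeholders.
from openai import AzureOpenAI

client = AzureOpenAI(
    api_key="YOUR_AZURE_KEY",                                  # --api-key
    api_version="2024-12-01-preview",                          # api_version field (patch default)
    azure_endpoint="https://YOUR_RESOURCE.openai.azure.com",   # --base-url
)

response = client.chat.completions.create(
    model="YOUR_DEPLOYMENT",  # --azure-deployment; falls back to the main model name when unset
    messages=[{"role": "user", "content": "Summarize the dependency analyzer module."}],
    temperature=0.0,
    max_tokens=1024,
)
print(response.choices[0].message.content)
```

Azure routes on the deployment name rather than the underlying model id, which is why the series threads `azure_deployment` through the CLI options, the `Configuration` model, and the backend `Config`.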