Skip to content

Commit 42d4ac5

Browse files
committed
fix(core): Add segment report noise suppression + CLI UX flags
- Merge overlapping segment windows in reports and suppress boilerplate-only groups
- Enforce segment complexity threshold and report suppressed count
- Add --version and --cache-path (legacy --cache-dir), tighten help text
- Update HTML rendering for single-item groups
- Extend tests and docs; keep CI/baseline logic unchanged
1 parent b1a0f6a commit 42d4ac5

11 files changed

Lines changed: 854 additions & 37 deletions

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ workflows that rely on existing baselines.
3636
- **Candidate generation**
3737
Used an order‑insensitive signature for candidate grouping and a strict segment hash for
3838
final confirmation; segment matches do not affect baseline or CI failure logic.
39+
- **Noise reduction (report‑only)**
40+
Merged overlapping segment windows into a single span per function and suppressed
41+
boilerplate‑only groups (attribute assignment wiring) using deterministic AST criteria.
3942

4043
### Baseline & CI
4144

@@ -52,6 +55,8 @@ codeclone . --update-baseline
5255

5356
### CLI UX (CI)
5457

58+
- Added `--version` for standard version output.
59+
- Added `--cache-path` (legacy alias: `--cache-dir`) and clarified cache help text.
5560
- Added `--ci` preset (`--fail-on-new --no-color --quiet`).
5661
- Improved `--fail-on-new` output with aggregated counts and clear next steps.
5762
- Validated report output extensions (`.html`, `.json`, `.txt`) and fail fast on mismatches.

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,12 @@ Generate an HTML report:
146146
codeclone . --html .cache/codeclone/report.html
147147
```
148148

149+
Check version:
150+
151+
```bash
152+
codeclone --version
153+
```
154+
149155
Run in CI mode:
150156

151157
```bash
@@ -191,7 +197,7 @@ By default, CodeClone stores the cache per project at:
191197

192198
`<root>/.cache/codeclone/cache.json`
193199

194-
You can override this path with `--cache-dir`.
200+
You can override this path with `--cache-path` (`--cache-dir` is a legacy alias).
195201

196202
If you used an older version of CodeClone, delete the legacy cache file at
197203
`~/.cache/codeclone/cache.json` and add `.cache/` to `.gitignore`.

codeclone/cli.py

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
build_block_groups,
3232
build_groups,
3333
build_segment_groups,
34+
prepare_segment_report_groups,
3435
to_json_report,
3536
to_text,
3637
)
@@ -47,6 +48,14 @@
4748
}
4849
)
4950

51+
52+
class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter):
53+
def _get_help_string(self, action: argparse.Action) -> str:
54+
if action.dest == "cache_path":
55+
return action.help or ""
56+
return cast(str, super()._get_help_string(action))
57+
58+
5059
LEGACY_CACHE_PATH = Path("~/.cache/codeclone/cache.json").expanduser()
5160

5261

@@ -176,7 +185,13 @@ def main() -> None:
176185
ap = argparse.ArgumentParser(
177186
prog="codeclone",
178187
description="AST and CFG-based code clone detector for Python.",
179-
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
188+
formatter_class=_HelpFormatter,
189+
)
190+
ap.add_argument(
191+
"--version",
192+
action="version",
193+
version=f"CodeClone {__version__}",
194+
help="Print the CodeClone version and exit.",
180195
)
181196

182197
# Core Arguments
@@ -208,13 +223,19 @@ def main() -> None:
208223
default=4,
209224
help="Number of parallel worker processes.",
210225
)
226+
tune_group.add_argument(
227+
"--cache-path",
228+
dest="cache_path",
229+
metavar="FILE",
230+
default=None,
231+
help="Path to the cache file. Default: <root>/.cache/codeclone/cache.json.",
232+
)
211233
tune_group.add_argument(
212234
"--cache-dir",
213-
default=".cache/codeclone/cache.json",
214-
help=(
215-
"Path to the cache file to speed up subsequent runs. "
216-
"Defaults to <root>/.cache/codeclone/cache.json."
217-
),
235+
dest="cache_path",
236+
metavar="FILE",
237+
default=None,
238+
help="Legacy alias for --cache-path.",
218239
)
219240

220241
# Baseline & CI
@@ -239,7 +260,10 @@ def main() -> None:
239260
type=int,
240261
default=-1,
241262
metavar="MAX_CLONES",
242-
help="Exit with error if total clone groups exceed this number.",
263+
help=(
264+
"Exit with error if total clone groups (function + block) "
265+
"exceed this number."
266+
),
243267
)
244268
ci_group.add_argument(
245269
"--ci",
@@ -288,8 +312,10 @@ def main() -> None:
288312
help="Print detailed hash identifiers for new clones.",
289313
)
290314

291-
cache_dir_from_args = any(
292-
arg == "--cache-dir" or arg.startswith("--cache-dir=") for arg in sys.argv
315+
cache_path_from_args = any(
316+
arg in {"--cache-dir", "--cache-path"}
317+
or arg.startswith(("--cache-dir=", "--cache-path="))
318+
for arg in sys.argv
293319
)
294320
args = ap.parse_args()
295321

@@ -337,8 +363,8 @@ def main() -> None:
337363

338364
# Initialize Cache
339365
cfg = NormalizationConfig()
340-
if cache_dir_from_args:
341-
cache_path = Path(args.cache_dir).expanduser()
366+
if cache_path_from_args and args.cache_path:
367+
cache_path = Path(args.cache_path).expanduser()
342368
else:
343369
cache_path = root_path / ".cache" / "codeclone" / "cache.json"
344370
if LEGACY_CACHE_PATH.exists():
@@ -601,10 +627,14 @@ def process_sequential(with_progress: bool) -> None:
601627
console.print(f" ... and {len(failed_files) - 10} more")
602628

603629
# Analysis phase
630+
suppressed_segment_groups = 0
604631
if args.quiet:
605632
func_groups = build_groups(all_units)
606633
block_groups = build_block_groups(all_blocks)
607634
segment_groups = build_segment_groups(all_segments)
635+
segment_groups, suppressed_segment_groups = prepare_segment_report_groups(
636+
segment_groups
637+
)
608638
try:
609639
cache.save()
610640
except CacheError as e:
@@ -614,6 +644,9 @@ def process_sequential(with_progress: bool) -> None:
614644
func_groups = build_groups(all_units)
615645
block_groups = build_block_groups(all_blocks)
616646
segment_groups = build_segment_groups(all_segments)
647+
segment_groups, suppressed_segment_groups = prepare_segment_report_groups(
648+
segment_groups
649+
)
617650
try:
618651
cache.save()
619652
except CacheError as e:
@@ -716,6 +749,8 @@ def process_sequential(with_progress: bool) -> None:
716749
table.add_row("Total Function Clones", str(func_clones_count))
717750
table.add_row("Total Block Clones", str(block_clones_count))
718751
table.add_row("Total Segment Clones", str(segment_clones_count))
752+
if suppressed_segment_groups > 0:
753+
table.add_row("Suppressed Segment Groups", str(suppressed_segment_groups))
719754

720755
if baseline_exists:
721756
style = "error" if new_clones_count > 0 else "success"

codeclone/html_report.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def __init__(self, maxsize: int = 128) -> None:
6666

6767
@staticmethod
6868
def _read_file_range(
69-
filepath: str, start_line: int, end_line: int
69+
filepath: str, start_line: int, end_line: int
7070
) -> tuple[str, ...]:
7171
if start_line < 1:
7272
start_line = 1
@@ -94,7 +94,7 @@ def _read_with_errors(errors: str) -> tuple[str, ...]:
9494
raise FileProcessingError(f"Cannot read {filepath}: {e}") from e
9595

9696
def get_lines_range(
97-
self, filepath: str, start_line: int, end_line: int
97+
self, filepath: str, start_line: int, end_line: int
9898
) -> tuple[str, ...]:
9999
return self._get_lines_impl(filepath, start_line, end_line)
100100

@@ -179,13 +179,13 @@ def _prefix_css(css: str, prefix: str) -> str:
179179

180180

181181
def _render_code_block(
182-
*,
183-
filepath: str,
184-
start_line: int,
185-
end_line: int,
186-
file_cache: _FileCache,
187-
context: int,
188-
max_lines: int,
182+
*,
183+
filepath: str,
184+
start_line: int,
185+
end_line: int,
186+
file_cache: _FileCache,
187+
context: int,
188+
max_lines: int,
189189
) -> _Snippet:
190190
s = max(1, start_line - context)
191191
e = end_line + context
@@ -237,13 +237,13 @@ def _group_sort_key(items: list[dict[str, Any]]) -> tuple[int, int]:
237237

238238

239239
def build_html_report(
240-
*,
241-
func_groups: dict[str, list[dict[str, Any]]],
242-
block_groups: dict[str, list[dict[str, Any]]],
243-
segment_groups: dict[str, list[dict[str, Any]]],
244-
title: str = "CodeClone Report",
245-
context_lines: int = 3,
246-
max_snippet_lines: int = 220,
240+
*,
241+
func_groups: dict[str, list[dict[str, Any]]],
242+
block_groups: dict[str, list[dict[str, Any]]],
243+
segment_groups: dict[str, list[dict[str, Any]]],
244+
title: str = "CodeClone Report",
245+
context_lines: int = 3,
246+
max_snippet_lines: int = 220,
247247
) -> str:
248248
file_cache = _FileCache()
249249

@@ -330,10 +330,10 @@ def _svg_icon(size: int, stroke_width: str, body: str) -> str:
330330
# ----------------------------
331331

332332
def render_section(
333-
section_id: str,
334-
section_title: str,
335-
groups: list[tuple[str, list[dict[str, Any]]]],
336-
pill_cls: str,
333+
section_id: str,
334+
section_title: str,
335+
groups: list[tuple[str, list[dict[str, Any]]]],
336+
pill_cls: str,
337337
) -> str:
338338
if not groups:
339339
return ""
@@ -423,10 +423,11 @@ def render_section(
423423

424424
out.append(f'<div class="items" id="group-body-{section_id}-{idx}">')
425425

426-
for a, b in pairwise(items):
426+
for i in range(0, len(items), 2):
427+
row_items = items[i : i + 2]
427428
out.append('<div class="item-pair">')
428429

429-
for item in (a, b):
430+
for item in row_items:
430431
snippet = _render_code_block(
431432
filepath=item["filepath"],
432433
start_line=int(item["start_line"]),

0 commit comments

Comments
 (0)