feat(version-scanner): support globbing and subpath patterns in ignore file#17539
feat(version-scanner): support globbing and subpath patterns in ignore file#17539chalmerlowe wants to merge 13 commits into
Conversation
…g under _safe_read_file
…nd hybrid packages
There was a problem hiding this comment.
Code Review
This pull request introduces a new _should_ignore helper function in version_scanner.py to handle wildcard and path-based ignore patterns during repository scanning, along with corresponding unit tests. The review feedback highlights critical correctness and performance issues in the implementation of _should_ignore. Specifically, it identifies a performance overhead from redundant string operations on every file, a trailing slash bug where directory-only patterns fail to match, and a leading slash anchoring bug that incorrectly matches subpaths. The reviewer provides actionable suggestions to resolve these issues by preprocessing and caching patterns using functools.lru_cache, introducing an is_dir parameter to distinguish files from directories, and adding robust test cases to verify these edge cases.
| def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str]) -> bool: | ||
| """Check if a file or directory matches any of the ignore patterns.""" | ||
| if not ignore_patterns: | ||
| return False | ||
| name_lower = name.lower() | ||
| rel_path_norm = rel_path.replace(os.sep, '/').lower() | ||
|
|
||
| for pattern in ignore_patterns: | ||
| pattern_lower = pattern.lower() | ||
| if '/' in pattern: | ||
| if pattern_lower.startswith('/'): | ||
| p = pattern_lower[1:] | ||
| else: | ||
| p = pattern_lower | ||
| if fnmatch.fnmatchcase(rel_path_norm, p) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p}"): | ||
| return True | ||
| else: | ||
| if fnmatch.fnmatchcase(name_lower, pattern_lower): | ||
| return True | ||
| return False |
There was a problem hiding this comment.
Performance and Correctness Improvements in _should_ignore
There are three key issues with the current implementation of _should_ignore:
- Performance Overhead:
_should_ignoreis called for every single file and directory duringos.walk. Currently, it performs string operations (.lower(),'/' in pattern,.startswith('/')) on every pattern for every single file/directory. Preprocessing the patterns once usingfunctools.lru_cacheavoids this redundant work. - Trailing Slash Bug: If a pattern ends with a slash (e.g.,
build/), it is meant to match only directories. However, since directory names passed to_should_ignoredo not have a trailing slash,fnmatch.fnmatchcase("build", "build/")will returnFalse, causing the directory to not be ignored. - Leading Slash Anchoring Bug: A pattern starting with a slash (e.g.,
/packages/pkg_a/.nox) is meant to be anchored to the root. However, the current code strips the leading slash and checks bothpand*/p, which incorrectly matches subpaths likesub/packages/pkg_a/.nox.
We can resolve all three issues by:
- Adding an
is_dirparameter to_should_ignore. - Preprocessing and caching the ignore patterns into structured tuples using
functools.lru_cache.
@functools.lru_cache(maxsize=32)
def _preprocess_patterns(patterns: tuple[str, ...]) -> list[tuple[bool, bool, bool, str]]:
processed = []
for pattern in patterns:
pattern_lower = pattern.lower()
is_dir_only = False
if pattern_lower.endswith('/'):
is_dir_only = True
pattern_lower = pattern_lower[:-1]
if '/' in pattern_lower:
if pattern_lower.startswith('/'):
p = pattern_lower[1:]
is_anchored = True
else:
p = pattern_lower
is_anchored = False
processed.append((True, is_dir_only, is_anchored, p))
else:
processed.append((False, is_dir_only, False, pattern_lower))
return processed
def _should_ignore(rel_path: str, name: str, ignore_patterns: List[str], is_dir: bool = False) -> bool:
"""Check if a file or directory matches any of the ignore patterns."""
if not ignore_patterns:
return False
processed_ignores = _preprocess_patterns(tuple(ignore_patterns))
name_lower = name.lower()
rel_path_norm = rel_path.replace(os.sep, '/').lower()
for is_path_pattern, is_dir_only, is_anchored, pattern in processed_ignores:
if is_dir_only and not is_dir:
continue
if is_path_pattern:
if is_anchored:
if fnmatch.fnmatchcase(rel_path_norm, pattern):
return True
else:
if fnmatch.fnmatchcase(rel_path_norm, pattern) or fnmatch.fnmatchcase(rel_path_norm, f"*/{pattern}"):
return True
else:
if fnmatch.fnmatchcase(name_lower, pattern):
return True
return False| # Prune ignore directories (case-insensitive) | ||
| dirs[:] = [d for d in dirs if d.lower() not in ignore_lower] | ||
| dirs[:] = [ | ||
| d for d in dirs | ||
| if not _should_ignore(get_rel_path(d), d, ignore_dirs) | ||
| ] | ||
|
|
||
| # Filter ignore files (case-insensitive) | ||
| files = [f for f in files if f.lower() not in ignore_lower] | ||
|
|
||
| rel_root = os.path.relpath(root, root_path) | ||
| files = [ | ||
| f for f in files | ||
| if not _should_ignore(get_rel_path(f), f, ignore_dirs) | ||
| ] |
There was a problem hiding this comment.
Pass is_dir=True and is_dir=False to _should_ignore to correctly distinguish between directories and files, enabling proper trailing-slash directory-only matching.
| # Prune ignore directories (case-insensitive) | |
| dirs[:] = [d for d in dirs if d.lower() not in ignore_lower] | |
| dirs[:] = [ | |
| d for d in dirs | |
| if not _should_ignore(get_rel_path(d), d, ignore_dirs) | |
| ] | |
| # Filter ignore files (case-insensitive) | |
| files = [f for f in files if f.lower() not in ignore_lower] | |
| rel_root = os.path.relpath(root, root_path) | |
| files = [ | |
| f for f in files | |
| if not _should_ignore(get_rel_path(f), f, ignore_dirs) | |
| ] | |
| # Prune ignore directories (case-insensitive) | |
| dirs[:] = [ | |
| d for d in dirs | |
| if not _should_ignore(get_rel_path(d), d, ignore_dirs, is_dir=True) | |
| ] | |
| # Filter ignore files (case-insensitive) | |
| files = [ | |
| f for f in files | |
| if not _should_ignore(get_rel_path(f), f, ignore_dirs, is_dir=False) | |
| ] |
| import argparse | ||
| import csv | ||
| import datetime | ||
| import fnmatch |
| def test__should_ignore(): | ||
| from version_scanner import _should_ignore | ||
|
|
||
| ignore_patterns = [ | ||
| ".git", | ||
| "*.jpg", | ||
| "packages/pkg_a/.nox", | ||
| "*.egg-info" | ||
| ] | ||
|
|
||
| # Exact match | ||
| assert _should_ignore(".git", ".git", ignore_patterns) is True | ||
| # Case insensitivity | ||
| assert _should_ignore(".GIT", ".GIT", ignore_patterns) is True | ||
| # Wildcard match | ||
| assert _should_ignore("some/path/image.jpg", "image.jpg", ignore_patterns) is True | ||
| assert _should_ignore("image.JPG", "image.JPG", ignore_patterns) is True | ||
| # Subpath match | ||
| assert _should_ignore("packages/pkg_a/.nox", ".nox", ignore_patterns) is True | ||
| # Wildcard directory match | ||
| assert _should_ignore("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", ignore_patterns) is True | ||
| # Negative match | ||
| assert _should_ignore("setup.py", "setup.py", ignore_patterns) is False |
There was a problem hiding this comment.
Add test cases to verify trailing slash directory-only matching and leading slash anchoring.
def test__should_ignore():
ignore_patterns = [
".git",
"*.jpg",
"packages/pkg_a/.nox",
"*.egg-info",
"build/",
"/anchored_dir"
]
# Exact match
assert _should_ignore(".git", ".git", ignore_patterns) is True
# Case insensitivity
assert _should_ignore(".GIT", ".GIT", ignore_patterns) is True
# Wildcard match
assert _should_ignore("some/path/image.jpg", "image.jpg", ignore_patterns) is True
assert _should_ignore("image.JPG", "image.JPG", ignore_patterns) is True
# Subpath match
assert _should_ignore("packages/pkg_a/.nox", ".nox", ignore_patterns) is True
# Wildcard directory match
assert _should_ignore("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", ignore_patterns) is True
# Trailing slash directory-only match
assert _should_ignore("build", "build", ignore_patterns, is_dir=True) is True
assert _should_ignore("build", "build", ignore_patterns, is_dir=False) is False
# Leading slash anchoring match
assert _should_ignore("anchored_dir", "anchored_dir", ignore_patterns) is True
assert _should_ignore("sub/anchored_dir", "anchored_dir", ignore_patterns) is False
# Negative match
assert _should_ignore("setup.py", "setup.py", ignore_patterns) is FalseReferences
- Do not import modules or classes inside functions if they are already imported at the module level.
04f3d2d to
69e81ab
Compare
…on-generated-packages
69e81ab to
111c152
Compare
…loat versions, and argument validation
111c152 to
055975c
Compare
This PR upgrades the ignore engine of the version scanner to support wildcard patterns (like *.jpg and *.egg-info) and relative directory subpaths (like packages/pkg_a/.nox) in .scannerignore.