Skip to content

Commit b138960

Browse files
committed
Speed up filename filtering.
Previously there was a `getcwd` syscall for every filename that was filtered. Instead, this is now cached per-run.

- When all files are identified by filename only: ~45% improvement
- When no files are identified by filename only: ~55% improvement

This makes little difference to overall execution; the bigger win is eliminating the `memoize_by_cwd` hack. Just removing the memoization would have *increased* the runtime by 300-500%.
1 parent 38308dc commit b138960

File tree

6 files changed

+61
-103
lines changed

6 files changed

+61
-103
lines changed

pre_commit/commands/run.py

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,47 @@
1717
from pre_commit.repository import install_hook_envs
1818
from pre_commit.staged_files_only import staged_files_only
1919
from pre_commit.util import cmd_output
20-
from pre_commit.util import memoize_by_cwd
2120
from pre_commit.util import noop_context
2221

2322

2423
logger = logging.getLogger('pre_commit')
2524

2625

27-
tags_from_path = memoize_by_cwd(tags_from_path)
26+
def filter_by_include_exclude(names, include, exclude):
27+
include_re, exclude_re = re.compile(include), re.compile(exclude)
28+
return [
29+
filename for filename in names
30+
if include_re.search(filename)
31+
if not exclude_re.search(filename)
32+
]
33+
34+
35+
class Classifier(object):
36+
def __init__(self, filenames):
37+
self.filenames = [f for f in filenames if os.path.lexists(f)]
38+
self._types_cache = {}
39+
40+
def _types_for_file(self, filename):
41+
try:
42+
return self._types_cache[filename]
43+
except KeyError:
44+
ret = self._types_cache[filename] = tags_from_path(filename)
45+
return ret
46+
47+
def by_types(self, names, types, exclude_types):
48+
types, exclude_types = frozenset(types), frozenset(exclude_types)
49+
ret = []
50+
for filename in names:
51+
tags = self._types_for_file(filename)
52+
if tags >= types and not tags & exclude_types:
53+
ret.append(filename)
54+
return ret
55+
56+
def filenames_for_hook(self, hook):
57+
names = self.filenames
58+
names = filter_by_include_exclude(names, hook.files, hook.exclude)
59+
names = self.by_types(names, hook.types, hook.exclude_types)
60+
return names
2861

2962

3063
def _get_skips(environ):
@@ -36,37 +69,12 @@ def _hook_msg_start(hook, verbose):
3669
return '{}{}'.format('[{}] '.format(hook.id) if verbose else '', hook.name)
3770

3871

39-
def _filter_by_include_exclude(filenames, include, exclude):
40-
include_re, exclude_re = re.compile(include), re.compile(exclude)
41-
return [
42-
filename for filename in filenames
43-
if (
44-
include_re.search(filename) and
45-
not exclude_re.search(filename) and
46-
os.path.lexists(filename)
47-
)
48-
]
49-
50-
51-
def _filter_by_types(filenames, types, exclude_types):
52-
types, exclude_types = frozenset(types), frozenset(exclude_types)
53-
ret = []
54-
for filename in filenames:
55-
tags = tags_from_path(filename)
56-
if tags >= types and not tags & exclude_types:
57-
ret.append(filename)
58-
return tuple(ret)
59-
60-
6172
SKIPPED = 'Skipped'
6273
NO_FILES = '(no files to check)'
6374

6475

65-
def _run_single_hook(filenames, hook, args, skips, cols):
66-
include, exclude = hook.files, hook.exclude
67-
filenames = _filter_by_include_exclude(filenames, include, exclude)
68-
types, exclude_types = hook.types, hook.exclude_types
69-
filenames = _filter_by_types(filenames, types, exclude_types)
76+
def _run_single_hook(classifier, hook, args, skips, cols):
77+
filenames = classifier.filenames_for_hook(hook)
7078

7179
if hook.language == 'pcre':
7280
logger.warning(
@@ -193,10 +201,11 @@ def _run_hooks(config, hooks, args, environ):
193201
skips = _get_skips(environ)
194202
cols = _compute_cols(hooks, args.verbose)
195203
filenames = _all_filenames(args)
196-
filenames = _filter_by_include_exclude(filenames, '', config['exclude'])
204+
filenames = filter_by_include_exclude(filenames, '', config['exclude'])
205+
classifier = Classifier(filenames)
197206
retval = 0
198207
for hook in hooks:
199-
retval |= _run_single_hook(filenames, hook, args, skips, cols)
208+
retval |= _run_single_hook(classifier, hook, args, skips, cols)
200209
if retval and config['fail_fast']:
201210
break
202211
if retval and args.show_diff_on_failure and git.has_diff():

pre_commit/meta_hooks/check_hooks_apply.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,19 @@
33
import pre_commit.constants as C
44
from pre_commit import git
55
from pre_commit.clientlib import load_config
6-
from pre_commit.commands.run import _filter_by_include_exclude
7-
from pre_commit.commands.run import _filter_by_types
6+
from pre_commit.commands.run import Classifier
87
from pre_commit.repository import all_hooks
98
from pre_commit.store import Store
109

1110

1211
def check_all_hooks_match_files(config_file):
13-
files = git.get_all_files()
12+
classifier = Classifier(git.get_all_files())
1413
retv = 0
1514

1615
for hook in all_hooks(load_config(config_file), Store()):
1716
if hook.always_run or hook.language == 'fail':
1817
continue
19-
include, exclude = hook.files, hook.exclude
20-
filtered = _filter_by_include_exclude(files, include, exclude)
21-
types, exclude_types = hook.types, hook.exclude_types
22-
filtered = _filter_by_types(filtered, types, exclude_types)
23-
if not filtered:
18+
elif not classifier.filenames_for_hook(hook):
2419
print('{} does not apply to this repository'.format(hook.id))
2520
retv = 1
2621

pre_commit/meta_hooks/check_useless_excludes.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pre_commit import git
1010
from pre_commit.clientlib import load_config
1111
from pre_commit.clientlib import MANIFEST_HOOK_DICT
12-
from pre_commit.commands.run import _filter_by_types
12+
from pre_commit.commands.run import Classifier
1313

1414

1515
def exclude_matches_any(filenames, include, exclude):
@@ -24,11 +24,11 @@ def exclude_matches_any(filenames, include, exclude):
2424

2525
def check_useless_excludes(config_file):
2626
config = load_config(config_file)
27-
files = git.get_all_files()
27+
classifier = Classifier(git.get_all_files())
2828
retv = 0
2929

3030
exclude = config['exclude']
31-
if not exclude_matches_any(files, '', exclude):
31+
if not exclude_matches_any(classifier.filenames, '', exclude):
3232
print(
3333
'The global exclude pattern {!r} does not match any files'
3434
.format(exclude),
@@ -40,10 +40,11 @@ def check_useless_excludes(config_file):
4040
# Not actually a manifest dict, but this more accurately reflects
4141
# the defaults applied during runtime
4242
hook = apply_defaults(hook, MANIFEST_HOOK_DICT)
43+
names = classifier.filenames
4344
types, exclude_types = hook['types'], hook['exclude_types']
44-
filtered_by_types = _filter_by_types(files, types, exclude_types)
45+
names = classifier.by_types(names, types, exclude_types)
4546
include, exclude = hook['files'], hook['exclude']
46-
if not exclude_matches_any(filtered_by_types, include, exclude):
47+
if not exclude_matches_any(names, include, exclude):
4748
print(
4849
'The exclude pattern {!r} for {} does not match any files'
4950
.format(exclude, hook['id']),

pre_commit/util.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import contextlib
44
import errno
5-
import functools
65
import os.path
76
import shutil
87
import stat
@@ -31,23 +30,6 @@ def mkdirp(path):
3130
raise
3231

3332

34-
def memoize_by_cwd(func):
35-
"""Memoize a function call based on os.getcwd()."""
36-
@functools.wraps(func)
37-
def wrapper(*args):
38-
cwd = os.getcwd()
39-
key = (cwd,) + args
40-
try:
41-
return wrapper._cache[key]
42-
except KeyError:
43-
ret = wrapper._cache[key] = func(*args)
44-
return ret
45-
46-
wrapper._cache = {}
47-
48-
return wrapper
49-
50-
5133
@contextlib.contextmanager
5234
def clean_path_on_failure(path):
5335
"""Cleans up the directory on an exceptional failure."""

tests/commands/run_test.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,10 @@
1111
import pre_commit.constants as C
1212
from pre_commit.commands.install_uninstall import install
1313
from pre_commit.commands.run import _compute_cols
14-
from pre_commit.commands.run import _filter_by_include_exclude
1514
from pre_commit.commands.run import _get_skips
1615
from pre_commit.commands.run import _has_unmerged_paths
16+
from pre_commit.commands.run import Classifier
17+
from pre_commit.commands.run import filter_by_include_exclude
1718
from pre_commit.commands.run import run
1819
from pre_commit.util import cmd_output
1920
from pre_commit.util import make_executable
@@ -748,18 +749,22 @@ def test_fail_fast(cap_out, store, repo_with_failing_hook):
748749
assert printed.count(b'Failing hook') == 1
749750

750751

752+
def test_classifier_removes_dne():
753+
classifier = Classifier(('this_file_does_not_exist',))
754+
assert classifier.filenames == []
755+
756+
751757
@pytest.fixture
752758
def some_filenames():
753759
return (
754760
'.pre-commit-hooks.yaml',
755-
'im_a_file_that_doesnt_exist.py',
756761
'pre_commit/git.py',
757762
'pre_commit/main.py',
758763
)
759764

760765

761766
def test_include_exclude_base_case(some_filenames):
762-
ret = _filter_by_include_exclude(some_filenames, '', '^$')
767+
ret = filter_by_include_exclude(some_filenames, '', '^$')
763768
assert ret == [
764769
'.pre-commit-hooks.yaml',
765770
'pre_commit/git.py',
@@ -771,22 +776,22 @@ def test_include_exclude_base_case(some_filenames):
771776
def test_matches_broken_symlink(tmpdir):
772777
with tmpdir.as_cwd():
773778
os.symlink('does-not-exist', 'link')
774-
ret = _filter_by_include_exclude({'link'}, '', '^$')
779+
ret = filter_by_include_exclude({'link'}, '', '^$')
775780
assert ret == ['link']
776781

777782

778783
def test_include_exclude_total_match(some_filenames):
779-
ret = _filter_by_include_exclude(some_filenames, r'^.*\.py$', '^$')
784+
ret = filter_by_include_exclude(some_filenames, r'^.*\.py$', '^$')
780785
assert ret == ['pre_commit/git.py', 'pre_commit/main.py']
781786

782787

783788
def test_include_exclude_does_search_instead_of_match(some_filenames):
784-
ret = _filter_by_include_exclude(some_filenames, r'\.yaml$', '^$')
789+
ret = filter_by_include_exclude(some_filenames, r'\.yaml$', '^$')
785790
assert ret == ['.pre-commit-hooks.yaml']
786791

787792

788793
def test_include_exclude_exclude_removes_files(some_filenames):
789-
ret = _filter_by_include_exclude(some_filenames, '', r'\.py$')
794+
ret = filter_by_include_exclude(some_filenames, '', r'\.py$')
790795
assert ret == ['.pre-commit-hooks.yaml']
791796

792797

tests/util_test.py

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,14 @@
11
from __future__ import unicode_literals
22

33
import os.path
4-
import random
54

65
import pytest
76

87
from pre_commit.util import CalledProcessError
98
from pre_commit.util import clean_path_on_failure
109
from pre_commit.util import cmd_output
11-
from pre_commit.util import memoize_by_cwd
1210
from pre_commit.util import parse_version
1311
from pre_commit.util import tmpdir
14-
from testing.util import cwd
1512

1613

1714
def test_CalledProcessError_str():
@@ -42,37 +39,6 @@ def test_CalledProcessError_str_nooutput():
4239
)
4340

4441

45-
@pytest.fixture
46-
def memoized_by_cwd():
47-
@memoize_by_cwd
48-
def func(arg):
49-
return arg + str(random.getrandbits(64))
50-
51-
return func
52-
53-
54-
def test_memoized_by_cwd_returns_same_twice_in_a_row(memoized_by_cwd):
55-
ret = memoized_by_cwd('baz')
56-
ret2 = memoized_by_cwd('baz')
57-
assert ret is ret2
58-
59-
60-
def test_memoized_by_cwd_returns_different_for_different_args(memoized_by_cwd):
61-
ret = memoized_by_cwd('baz')
62-
ret2 = memoized_by_cwd('bar')
63-
assert ret.startswith('baz')
64-
assert ret2.startswith('bar')
65-
assert ret != ret2
66-
67-
68-
def test_memoized_by_cwd_changes_with_different_cwd(memoized_by_cwd):
69-
ret = memoized_by_cwd('baz')
70-
with cwd('.git'):
71-
ret2 = memoized_by_cwd('baz')
72-
73-
assert ret != ret2
74-
75-
7642
def test_clean_on_failure_noop(in_tmpdir):
7743
with clean_path_on_failure('foo'):
7844
pass

0 commit comments

Comments
 (0)