From c24fa39d860ef169dfac51bf63cded742334b0a8 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 19:36:12 -0700 Subject: [PATCH 1/9] fix: bound DNSCache record count to prevent unbounded LAN-driven growth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #1715. A LAN-local peer multicasting unique-name mDNS responses could grow ``DNSCache.cache`` / ``_expirations`` / ``service_cache`` / ``_expire_heap`` without bound. ``_DNS_PTR_MIN_TTL = 1125`` (RFC 6762 recommendation for ServiceBrowser refresh-floor) actually extends attacker-injected PTR records to ~19 min before they expire naturally, and ``async_expire`` only runs every ``_CACHE_CLEANUP_INTERVAL = 10`` s — too slow to keep up with a sustained flood. On memory-constrained deployments (Home Assistant on a Raspberry Pi is the canonical victim) this trivially OOMs the process. Add a hard cap (``_MAX_CACHE_RECORDS = 10_000`` in ``const.py``) on the total number of records the cache will hold. ``DNSCache`` now tracks ``_total_records`` (incremented on genuinely-new inserts in ``_async_add``, decremented in ``_async_remove``). When the cap is hit, ``_async_evict_oldest`` heappops the closest-to-expiration record and removes it before inserting the new one. Reuses the existing ``_expire_heap`` for victim selection — O(log n) — and skips stale heap entries via the same expiration-equality check ``async_expire`` already uses. All four touched lines compile to direct C int ops in the Cython build (score-0 in ``cython -a`` annotation); the eviction call is a C-level vtable dispatch fired only on overflow. Wall-clock check: below-cap add stays at ~160 ns/record; over-cap add with constant eviction is ~195 ns/record (+36 ns for the heappop + cache delete). Per-source-IP record quotas — the reporter's "Better" suggestion — are deferred to a follow-up PR. This commit closes the unbounded- growth bug class on its own; the quota work changes the fairness properties but not the memory bound. CWE-400 (Uncontrolled Resource Consumption). LAN-local attack surface only. --- src/zeroconf/_cache.pxd | 8 +++- src/zeroconf/_cache.py | 31 ++++++++++++++- src/zeroconf/const.py | 6 +++ tests/benchmarks/test_cache_bound.py | 44 +++++++++++++++++++++ tests/test_cache.py | 57 ++++++++++++++++++++++++++++ 5 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 tests/benchmarks/test_cache_bound.py diff --git a/src/zeroconf/_cache.pxd b/src/zeroconf/_cache.pxd index 05a40c0f..fa1942c5 100644 --- a/src/zeroconf/_cache.pxd +++ b/src/zeroconf/_cache.pxd @@ -19,6 +19,7 @@ cdef object _UNIQUE_RECORD_TYPES cdef unsigned int _TYPE_PTR cdef cython.uint _ONE_SECOND cdef unsigned int _MIN_SCHEDULED_RECORD_EXPIRATION +cdef unsigned int _MAX_CACHE_RECORDS @cython.locals(record_cache=dict) @@ -31,6 +32,7 @@ cdef class DNSCache: cdef public cython.dict service_cache cdef public list _expire_heap cdef public dict _expirations + cdef public unsigned int _total_records cpdef bint async_add_records(self, object entries) @@ -60,10 +62,14 @@ cdef class DNSCache: service_store=cython.dict, service_record=DNSService, when=object, - new=bint + new=bint, + is_new=bint ) cdef bint _async_add(self, DNSRecord record) + @cython.locals(record=DNSRecord, when_record=tuple) + cdef void _async_evict_oldest(self) + @cython.locals(service_record=DNSService) cdef void _async_remove(self, DNSRecord record) diff --git a/src/zeroconf/_cache.py b/src/zeroconf/_cache.py index 94af3169..c523becc 100644 --- a/src/zeroconf/_cache.py +++ b/src/zeroconf/_cache.py @@ -37,7 +37,7 @@ DNSText, ) from ._utils.time import current_time_millis -from .const import _ONE_SECOND, _TYPE_PTR +from .const import _MAX_CACHE_RECORDS, _ONE_SECOND, _TYPE_PTR _UNIQUE_RECORD_TYPES = (DNSAddress, DNSHinfo, DNSPointer, DNSText, DNSService) _UniqueRecordsType = DNSAddress | DNSHinfo | DNSPointer | DNSText | DNSService @@ -72,6 +72,7 @@ def __init__(self) -> None: self._expire_heap: list[tuple[float, DNSRecord]] = [] self._expirations: dict[DNSRecord, float] = {} self.service_cache: _DNSRecordCacheType = {} + self._total_records: int = 0 # Functions prefixed with async_ are NOT threadsafe and must # be run in the event loop. @@ -91,7 +92,16 @@ def _async_add(self, record: _DNSRecord) -> bool: # direction would return the old incorrect entry. if (store := self.cache.get(record.key)) is None: store = self.cache[record.key] = {} - new = record not in store and not isinstance(record, DNSNsec) + is_new = record not in store + # Bound total cache size; evict closest-to-expiration entry to + # make room before inserting a new record. Prevents a LAN-local + # flood of unique-name records from growing the cache without + # bound (RFC 6762 §10 advisory caching, defense-in-depth). + if is_new and self._total_records >= _MAX_CACHE_RECORDS: + self._async_evict_oldest() + new = is_new and not isinstance(record, DNSNsec) + if is_new: + self._total_records += 1 store[record] = record when = record.created + (record.ttl * 1000) if self._expirations.get(record) != when: @@ -106,6 +116,22 @@ def _async_add(self, record: _DNSRecord) -> bool: service_store[service_record] = service_record return new + def _async_evict_oldest(self) -> None: + """Drop the closest-to-expiration record to make room for a new one. + + Skips stale heap entries (records re-added with a different TTL), + which mirrors the staleness check in ``async_expire``. + + This function must be run in from event loop. + """ + while self._expire_heap: + when_record = heappop(self._expire_heap) + record = when_record[1] + if self._expirations.get(record) != when_record[0]: + continue + self._async_remove(record) + return + def async_add_records(self, entries: Iterable[DNSRecord]) -> bool: """Add multiple records. @@ -129,6 +155,7 @@ def _async_remove(self, record: _DNSRecord) -> None: _remove_key(self.service_cache, service_record.server_key, service_record) _remove_key(self.cache, record.key, record) self._expirations.pop(record, None) + self._total_records -= 1 def async_remove_records(self, entries: Iterable[DNSRecord]) -> None: """Remove multiple records. diff --git a/src/zeroconf/const.py b/src/zeroconf/const.py index 1db39a46..a17e4685 100644 --- a/src/zeroconf/const.py +++ b/src/zeroconf/const.py @@ -59,6 +59,12 @@ # level of rate limit and safe guards so we use 1/4 of the recommended value _DNS_PTR_MIN_TTL = 1125 +# Upper bound on the number of records the DNSCache will hold before it +# starts evicting the closest-to-expiration entry to make room for new +# arrivals. Bounds the memory a malicious LAN peer can force the cache +# to retain by multicasting many unique-name records. +_MAX_CACHE_RECORDS = 10000 + _DNS_PACKET_HEADER_LEN = 12 _MAX_MSG_TYPICAL = 1460 # unused diff --git a/tests/benchmarks/test_cache_bound.py b/tests/benchmarks/test_cache_bound.py new file mode 100644 index 00000000..0ccb6bdd --- /dev/null +++ b/tests/benchmarks/test_cache_bound.py @@ -0,0 +1,44 @@ +"""Benchmark for the DNSCache record-count bound + overflow eviction.""" + +from __future__ import annotations + +from pytest_codspeed import BenchmarkFixture + +from zeroconf import DNSAddress, DNSCache, current_time_millis +from zeroconf.const import _CLASS_IN, _MAX_CACHE_RECORDS, _TYPE_A + + +def _make_records(count: int, now: float) -> list[DNSAddress]: + return [ + DNSAddress( + f"bench-{i}.local.", + _TYPE_A, + _CLASS_IN, + 120, + bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + i, + ) + for i in range(count) + ] + + +def test_cache_add_below_cap(benchmark: BenchmarkFixture) -> None: + """Adding records while the cache is well below the cap (no eviction).""" + now = current_time_millis() + records = _make_records(1000, now) + + @benchmark + def _add() -> None: + cache = DNSCache() + cache.async_add_records(records) + + +def test_cache_add_at_cap_evicts(benchmark: BenchmarkFixture) -> None: + """Steady-state add at the cap: every new record forces one eviction.""" + now = current_time_millis() + overflow_records = _make_records(_MAX_CACHE_RECORDS + 1000, now) + + @benchmark + def _flood() -> None: + cache = DNSCache() + cache.async_add_records(overflow_records) diff --git a/tests/test_cache.py b/tests/test_cache.py index 9d55435d..a2ef9ee2 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -482,3 +482,60 @@ async def test_cache_heap_pops_order() -> None: ts, _ = heappop(cache._expire_heap) assert ts >= start_ts start_ts = ts + + +def test_cache_size_is_bounded() -> None: + """A flood of unique-name records is capped at ``_MAX_CACHE_RECORDS``.""" + cache = r.DNSCache() + now = r.current_time_millis() + overflow = 1000 + flood_size = const._MAX_CACHE_RECORDS + overflow + + cache.async_add_records( + r.DNSAddress( + f"flood-{i}.local.", + const._TYPE_A, + const._CLASS_IN, + 120, + bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + i, + ) + for i in range(flood_size) + ) + + total = sum(len(store) for store in cache.cache.values()) + assert total == const._MAX_CACHE_RECORDS + assert cache._total_records == const._MAX_CACHE_RECORDS + # FIFO-ish: the earliest-created records (closest to expiration) get + # evicted first, so the names that remain are from the tail. + for i in range(overflow): + assert f"flood-{i}.local." not in cache.cache + for i in range(flood_size - overflow, flood_size): + assert f"flood-{i}.local." in cache.cache + + +def test_cache_eviction_decrements_total_records() -> None: + """Natural removal (goodbyes, expirations) must keep ``_total_records`` in sync.""" + cache = r.DNSCache() + now = r.current_time_millis() + records = [ + r.DNSAddress( + f"sync-{i}.local.", + const._TYPE_A, + const._CLASS_IN, + 120, + bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now, + ) + for i in range(50) + ] + cache.async_add_records(records) + assert cache._total_records == 50 + + cache.async_remove_records(records[:20]) + assert cache._total_records == 30 + + # async_expire on a future time should drop the rest + cache.async_expire(now + (200 * 1000)) + assert cache._total_records == 0 + assert not cache.cache From 51fbc279a0f6e434debbba196321c608c781a805 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 19:42:17 -0700 Subject: [PATCH 2/9] test: cover stale-heap-entry branch in _async_evict_oldest Adds ``test_cache_eviction_skips_stale_heap_entries`` which fills the cache to ``_MAX_CACHE_RECORDS``, re-adds the closest-to-expiration record with a longer TTL (which leaves a stale ``(when, record)`` tuple at the head of ``_expire_heap`` and updates ``_expirations`` with the new ``when``), then forces a single eviction and asserts the refreshed record survives while the next-oldest legitimate record is the one that gets evicted. Without this test the ``continue`` branch inside ``_async_evict_oldest`` would be untouched by the suite. --- tests/test_cache.py | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/test_cache.py b/tests/test_cache.py index a2ef9ee2..ecb59599 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -514,6 +514,59 @@ def test_cache_size_is_bounded() -> None: assert f"flood-{i}.local." in cache.cache +def test_cache_eviction_skips_stale_heap_entries() -> None: + """Eviction must skip stale ``_expire_heap`` entries left by TTL re-adds.""" + cache = r.DNSCache() + now = r.current_time_millis() + # Fill to one shy of the cap so a single add triggers exactly one eviction + cache.async_add_records( + r.DNSAddress( + f"stale-{i}.local.", + const._TYPE_A, + const._CLASS_IN, + 120, + bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + i, + ) + for i in range(const._MAX_CACHE_RECORDS) + ) + assert cache._total_records == const._MAX_CACHE_RECORDS + + # Re-add the closest-to-expiration record with a longer TTL. The original + # (when, record) tuple stays at the head of _expire_heap as a stale entry; + # _expirations[record] now points to the new (later) expiration. Eviction + # must pop and skip that stale tuple before settling on a real victim. + victim_name = "stale-0.local." + refreshed = r.DNSAddress( + victim_name, + const._TYPE_A, + const._CLASS_IN, + 7200, + bytes((0, 0, 0, 1)), + created=now + 0, + ) + cache.async_add_records([refreshed]) + assert cache._total_records == const._MAX_CACHE_RECORDS + + # Force one eviction. The next-oldest legitimate record must go; the + # refreshed record stays alive. + cache.async_add_records( + [ + r.DNSAddress( + "trigger.local.", + const._TYPE_A, + const._CLASS_IN, + 120, + b"\xff\xff\xff\xff", + created=now + const._MAX_CACHE_RECORDS, + ) + ] + ) + assert cache._total_records == const._MAX_CACHE_RECORDS + assert victim_name in cache.cache + assert "stale-1.local." not in cache.cache + + def test_cache_eviction_decrements_total_records() -> None: """Natural removal (goodbyes, expirations) must keep ``_total_records`` in sync.""" cache = r.DNSCache() From 931358ab6c6fcfb369d9a5bef0303dfab3786228 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 20:05:05 -0700 Subject: [PATCH 3/9] fix: re-resolve store after eviction and rebuild stale-prefix heap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback on #1718. **Orphaned-store bug (blocking).** ``_async_add`` was capturing ``store = self.cache.get(record.key)`` before calling ``_async_evict_oldest``. If the closest-to-expiration victim was the last record under ``record.key`` (e.g. the incoming record shares its key with the victim — A+AAAA pairs, PTR fan-outs, TXT/SRV pairs all hit this in real workloads), ``_remove_key`` would ``del self.cache[record.key]`` during eviction, leaving the captured ``store`` pointing at an orphaned dict. The subsequent ``store[record] = record`` then wrote into a dict no longer reachable through the cache; the new record was effectively lost, and a later ``async_expire`` of that record would ``KeyError`` from ``_remove_key``. Defer the bucket creation: look up ``store`` first (without creating), do the eviction-if-needed pass, then ``cache.get`` again to pick up any deletion eviction caused. The bucket is only created with ``self.cache[record.key] = {}`` after we know eviction has settled. **Stale-prefix scan (suggestion #1).** Under sustained TTL re-adds — e.g. ``async_mark_unique_records_older_than_1s_to_expire`` rewriting TTLs to 1 — ``_expire_heap`` accumulates stale ``(when, record)`` tuples that no longer match ``_expirations[record]``. The previous eviction body would pop those one at a time, doing O(stale_prefix * log n) work per ``_async_add``. Reuse the same ``len(heap) > _MIN_SCHEDULED_RECORD_EXPIRATION and len(heap) > len(expirations) * 2`` heuristic that ``async_expire`` already runs: when the heap is mostly stale, rebuild it once up front, then run the pop loop against only live entries. **Accounting invariant test.** ``_total_records`` is ``cdef public unsigned int``; an off-by-one in ``_async_add`` / ``_async_remove`` would silently underflow in the Cython wheel and pin the cap-check True forever (eviction storm on every add). Rather than defensively zero-clamp the decrement — which would hide the bug — the new ``test_cache_total_records_invariant_under_mixed_ops`` walks the counter through every code path that touches it (fresh inserts, re-adds, DNSService, DNSNsec, shared-key inserts that empty their bucket on removal, full-cap eviction loop, ``async_expire``, ``async_remove_records``) and asserts ``cache._total_records == sum(len(s) for s in cache.cache.values())`` after every step. Any future change that misses an increment or doubles a decrement fails loudly. Regression tests: - ``test_cache_eviction_victim_shares_key_with_new_record`` — reproduces the orphan; would have failed against c24fa39. - ``test_cache_eviction_rebuilds_heap_when_mostly_stale`` — fills to cap then re-adds with two distinct TTLs to push the heap to ~3x MAX, then triggers eviction and asserts the rebuild ran (heap and expirations match after the pass). - ``test_cache_total_records_invariant_under_mixed_ops`` — counter invariant across all touched code paths. The ``.pxd`` adds ``expire_heap_len`` as ``unsigned int`` to ``_async_evict_oldest``'s locals so the new threshold comparison stays a direct C int op in the Cython build. --- src/zeroconf/_cache.pxd | 2 +- src/zeroconf/_cache.py | 26 ++++- tests/test_cache.py | 214 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 5 deletions(-) diff --git a/src/zeroconf/_cache.pxd b/src/zeroconf/_cache.pxd index fa1942c5..b94131c3 100644 --- a/src/zeroconf/_cache.pxd +++ b/src/zeroconf/_cache.pxd @@ -67,7 +67,7 @@ cdef class DNSCache: ) cdef bint _async_add(self, DNSRecord record) - @cython.locals(record=DNSRecord, when_record=tuple) + @cython.locals(record=DNSRecord, when_record=tuple, expire_heap_len="unsigned int") cdef void _async_evict_oldest(self) @cython.locals(service_record=DNSService) diff --git a/src/zeroconf/_cache.py b/src/zeroconf/_cache.py index c523becc..353117f3 100644 --- a/src/zeroconf/_cache.py +++ b/src/zeroconf/_cache.py @@ -90,15 +90,20 @@ def _async_add(self, record: _DNSRecord) -> bool: # replaces any existing records that are __eq__ to each other which # removes the risk that accessing the cache from the wrong # direction would return the old incorrect entry. - if (store := self.cache.get(record.key)) is None: - store = self.cache[record.key] = {} - is_new = record not in store + store = self.cache.get(record.key) + is_new = store is None or record not in store # Bound total cache size; evict closest-to-expiration entry to # make room before inserting a new record. Prevents a LAN-local # flood of unique-name records from growing the cache without # bound (RFC 6762 §10 advisory caching, defense-in-depth). if is_new and self._total_records >= _MAX_CACHE_RECORDS: self._async_evict_oldest() + # The victim may have been the last record under + # ``record.key``, in which case ``_remove_key`` deleted + # the bucket. Re-fetch before creating below. + store = self.cache.get(record.key) + if store is None: + store = self.cache[record.key] = {} new = is_new and not isinstance(record, DNSNsec) if is_new: self._total_records += 1 @@ -120,10 +125,23 @@ def _async_evict_oldest(self) -> None: """Drop the closest-to-expiration record to make room for a new one. Skips stale heap entries (records re-added with a different TTL), - which mirrors the staleness check in ``async_expire``. + which mirrors the staleness check in ``async_expire``. If the + heap is mostly stale (long stale prefix from sustained TTL + re-adds), rebuild it once up front so the pop loop below + doesn't do O(stale_prefix * log n) work on a single add. Same + heuristic / threshold as ``async_expire``. This function must be run in from event loop. """ + expire_heap_len = len(self._expire_heap) + if ( + expire_heap_len > _MIN_SCHEDULED_RECORD_EXPIRATION + and expire_heap_len > len(self._expirations) * 2 + ): + self._expire_heap = [ + entry for entry in self._expire_heap if self._expirations.get(entry[1]) == entry[0] + ] + heapify(self._expire_heap) while self._expire_heap: when_record = heappop(self._expire_heap) record = when_record[1] diff --git a/tests/test_cache.py b/tests/test_cache.py index ecb59599..80607ffd 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -567,6 +567,128 @@ def test_cache_eviction_skips_stale_heap_entries() -> None: assert "stale-1.local." not in cache.cache +def test_cache_eviction_victim_shares_key_with_new_record() -> None: + """Eviction must not orphan the new record when it collides on ``record.key``. + + If the closest-to-expiration record is the *last* one stored under + ``record.key`` and the incoming record uses the same key, + ``_remove_key`` deletes ``self.cache[record.key]`` during eviction. + A previous capture of ``store = self.cache.get(record.key)`` would + then write the new record into an orphaned dict not reachable via + the cache. Pin that the new record is reachable. + """ + cache = r.DNSCache() + now = r.current_time_millis() + # Fill the cache to one shy of the cap with unique-name records, each + # with a later created time than the shared-key victim below. + cache.async_add_records( + r.DNSAddress( + f"filler-{i}.local.", + const._TYPE_A, + const._CLASS_IN, + 120, + bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + 1000 + i, + ) + for i in range(const._MAX_CACHE_RECORDS - 1) + ) + + # Insert a record at "shared.local." with the earliest expiration — + # this is the one closest_to_expiration eviction will pick. + shared_key = "shared.local." + old_shared = r.DNSAddress( + shared_key, + const._TYPE_A, + const._CLASS_IN, + 120, + b"\x01\x02\x03\x04", + created=now, + ) + cache.async_add_records([old_shared]) + assert cache._total_records == const._MAX_CACHE_RECORDS + assert shared_key in cache.cache + + # Add a new record with the SAME key. Eviction will remove old_shared, + # which empties cache[shared_key], so _remove_key deletes that bucket. + # If _async_add captured the store before eviction it will now be + # writing into an orphaned dict. + new_shared = r.DNSAddress( + shared_key, + const._TYPE_A, + const._CLASS_IN, + 120, + b"\x05\x06\x07\x08", + created=now + 999, + ) + cache.async_add_records([new_shared]) + + # The new record must be reachable via the cache. + assert shared_key in cache.cache, "new record orphaned: cache bucket missing" + assert new_shared in cache.cache[shared_key] + assert cache.async_get_unique(new_shared) == new_shared + # And the counter accounting must still match observable state. + total = sum(len(store) for store in cache.cache.values()) + assert total == cache._total_records + + +def test_cache_eviction_rebuilds_heap_when_mostly_stale() -> None: + """Eviction rebuilds ``_expire_heap`` up front when stale entries dominate. + + Without the rebuild, a single ``_async_add`` at cap could pop a long + stale prefix one entry at a time — O(stale_prefix * log n). The + rebuild is the same heuristic ``async_expire`` already uses: when + ``len(heap) > 2 * len(expirations)``, the heap is reconstructed + from only the valid entries in one O(n) pass before the pop loop. + """ + cache = r.DNSCache() + now = r.current_time_millis() + base_kwargs = {"type_": const._TYPE_A, "class_": const._CLASS_IN} + + cache.async_add_records( + r.DNSAddress( + f"r-{i}.local.", + ttl=120, + address=bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + i, + **base_kwargs, + ) + for i in range(const._MAX_CACHE_RECORDS) + ) + # Two re-adds with distinct TTLs leave two stale entries per record + # in ``_expire_heap`` while ``_expirations`` stays at MAX. Heap reaches + # ~3x MAX, which trips the ``len(heap) > 2 * len(expirations)`` + # threshold on the next eviction. + for ttl in (7200, 14400): + cache.async_add_records( + r.DNSAddress( + f"r-{i}.local.", + ttl=ttl, + address=bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + i, + **base_kwargs, + ) + for i in range(const._MAX_CACHE_RECORDS) + ) + assert len(cache._expire_heap) > 2 * len(cache._expirations) + + # One more insert pushes us over the cap; eviction fires and must + # rebuild before scanning. + cache.async_add_records( + [ + r.DNSAddress( + "trigger.local.", + ttl=120, + address=b"\xff\xff\xff\xff", + created=now + const._MAX_CACHE_RECORDS, + **base_kwargs, + ) + ] + ) + # Rebuild dropped the stale entries; heap and expirations now match. + assert len(cache._expire_heap) == len(cache._expirations) == const._MAX_CACHE_RECORDS + assert cache._total_records == const._MAX_CACHE_RECORDS + + def test_cache_eviction_decrements_total_records() -> None: """Natural removal (goodbyes, expirations) must keep ``_total_records`` in sync.""" cache = r.DNSCache() @@ -592,3 +714,95 @@ def test_cache_eviction_decrements_total_records() -> None: cache.async_expire(now + (200 * 1000)) assert cache._total_records == 0 assert not cache.cache + + +def test_cache_total_records_invariant_under_mixed_ops() -> None: + """``_total_records`` must equal ``sum(len(store) for store in cache.values())``. + + Walks the counter through every code path that touches it — new inserts, + re-adds of an existing record (no increment), DNSService (extra + service_cache write), DNSNsec (stored but not counted as "new" by the + flag), shared-key inserts that empty their bucket on removal, full-cap + eviction, async_expire, manual async_remove_records — and asserts the + invariant after every step. Any future change that misses an increment + or doubles a decrement will trip this immediately; ``_total_records`` is + ``cdef unsigned int`` in the Cython build, so silent underflow would + otherwise propagate as a permanent eviction-on-every-add storm. + """ + cache = r.DNSCache() + now = r.current_time_millis() + + def actual() -> int: + return sum(len(store) for store in cache.cache.values()) + + # Fresh inserts: counter follows insert count exactly. + addrs = [ + r.DNSAddress( + f"mix-{i}.local.", const._TYPE_A, const._CLASS_IN, 120, bytes((i, 0, 0, 1)), created=now + i + ) + for i in range(20) + ] + cache.async_add_records(addrs) + assert cache._total_records == actual() == 20 + + # Re-add of an identical record: no increment (already present). + cache.async_add_records([addrs[0]]) + assert cache._total_records == actual() == 20 + + # DNSService writes service_cache too — counter must still match cache size. + svc = r.DNSService("svc.local.", const._TYPE_SRV, const._CLASS_IN, 120, 0, 0, 80, "host.local.") + cache.async_add_records([svc]) + assert cache._total_records == actual() == 21 + cache.async_remove_records([svc]) + assert cache._total_records == actual() == 20 + + # DNSNsec is stored but excluded from the "new" return value; the counter + # still tracks it because it occupies a bucket slot. + nsec = r.DNSNsec( + "nsec.local.", + const._TYPE_NSEC, + const._CLASS_IN, + 120, + "nsec.local.", + [const._TYPE_A], + ) + cache.async_add_records([nsec]) + assert cache._total_records == actual() == 21 + cache.async_remove_records([nsec]) + assert cache._total_records == actual() == 20 + + # Shared-key insert/remove: emptying a bucket removes the cache key, but + # the counter must still decrement only by the number of records gone. + shared_a = r.DNSAddress( + "shared.local.", const._TYPE_A, const._CLASS_IN, 120, b"\x01\x01\x01\x01", created=now + ) + shared_b = r.DNSAddress( + "shared.local.", const._TYPE_A, const._CLASS_IN, 120, b"\x02\x02\x02\x02", created=now + ) + cache.async_add_records([shared_a, shared_b]) + assert cache._total_records == actual() == 22 + cache.async_remove_records([shared_a, shared_b]) + assert cache._total_records == actual() == 20 + assert "shared.local." not in cache.cache + + # async_expire path: counter must follow the records dropped. + cache.async_expire(now + (200 * 1000)) + assert cache._total_records == actual() == 0 + assert not cache.cache + + # Full-cap eviction loop: counter never grows past the cap, never drifts. + cap_records = [ + r.DNSAddress( + f"cap-{i}.local.", + const._TYPE_A, + const._CLASS_IN, + 120, + bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + created=now + i, + ) + for i in range(const._MAX_CACHE_RECORDS + 50) + ] + for rec in cap_records: + cache.async_add_records([rec]) + assert cache._total_records == actual() + assert cache._total_records == const._MAX_CACHE_RECORDS From 7c6be63a9fc8134bddbdf2e1e2821e5107cb36e6 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 20:48:14 -0700 Subject: [PATCH 4/9] fix: bound _expire_heap growth from re-adds and DRY test helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address remaining review feedback on #1718. **Re-add heap growth (Copilot inline at L100, real bug separate from the orphan).** ``_async_add``'s cap check is gated on ``is_new``, but a peer that just replays cached records with a different ``created``/ TTL hits the ``is_new == False`` path: no cap check, no ``_total_records`` change — yet the changed ``when`` makes ``_expirations.get(record) != when`` true, so a fresh ``(when, record)`` tuple is heappushed while the prior tuple stays behind as stale. ``_expire_heap`` grows unbounded between cleanups even though ``cache`` and ``_total_records`` stay flat. At ~10k pps for 10 s that's ~1 M stale heap entries × ~100 B ≈ 100 MB transient. Factor the rebuild check (``len(heap) > _MIN_SCHEDULED_RECORD_EXPIRATION and len(heap) > 2 * len(expirations)`` → filter-and-heapify) into a new ``_maybe_rebuild_heap`` cdef helper. Call it from three sites: - ``_async_add`` immediately after the conditional heappush, so a re-add flood is bounded at every step (amortized O(1) per push, O(N) rebuild fires at most once per N stale pushes) - ``async_expire`` after the pop loop, replacing the existing inline rebuild — same behavior, slightly more aggressive (uses post-pop heap length instead of pre-pop), no test regression - (removed from ``_async_evict_oldest`` — the eager rebuild in ``_async_add`` keeps the heap bounded before eviction is ever triggered, so the eviction-side call was dead code) Cython codegen: the rebuild call from ``_async_add`` only fires when the conditional heappush already fired, so the hot path on duplicate re-adds (same ``when``) is unchanged. The check inside ``_maybe_rebuild_heap`` is two ``len()`` calls + an integer compare — score-0 in ``cython -a``. **Test DRY-up.** Introduce a module-level ``_addr(name, idx, ...)`` helper that builds the boilerplate ``DNSAddress(name, _TYPE_A, _CLASS_IN, ttl, bytes((idx & 0xFF, (idx >> 8) & 0xFF, 0, 1)), created =...)`` so each new test reads top-down instead of being dominated by constructor noise. Test docstrings collapsed to one sentence per ``CLAUDE.md``. **New regression test:** ``test_cache_re_add_flood_does_not_grow_heap_unbounded`` adds 200 records below the cap, then replays them through 10 cycles of varying TTLs (2000 stale pushes total) and asserts the heap stays near the rebuild threshold. The previous code would have grown it to ~11x the record count; the helper keeps it within ``2 * expirations``. **Removed test:** ``test_cache_eviction_rebuilds_heap_when_mostly_stale`` — its premise (build up stale entries, then trigger eviction-side rebuild) is no longer reachable now that ``_async_add`` rebuilds proactively. The same code path is covered by the re-add flood test. **Existing tests updated:** ``test_cache_heap_multi_name_cleanup`` and ``test_cache_heap_pops_order`` previously asserted pre-cleanup heap size (``min_records_to_cleanup + 5``). With proactive rebuild firing inside ``_async_add``, the heap is cleaned up earlier and the post- cleanup invariant (``1 + 5`` after ``async_expire``) still holds, but the intermediate snapshot is no longer accurate. Replaced the stale intermediate assertion with a comment. --- src/zeroconf/_cache.pxd | 5 +- src/zeroconf/_cache.py | 72 +++++------ tests/test_cache.py | 277 ++++++++++------------------------------ 3 files changed, 99 insertions(+), 255 deletions(-) diff --git a/src/zeroconf/_cache.pxd b/src/zeroconf/_cache.pxd index b94131c3..023304bc 100644 --- a/src/zeroconf/_cache.pxd +++ b/src/zeroconf/_cache.pxd @@ -67,9 +67,12 @@ cdef class DNSCache: ) cdef bint _async_add(self, DNSRecord record) - @cython.locals(record=DNSRecord, when_record=tuple, expire_heap_len="unsigned int") + @cython.locals(record=DNSRecord, when_record=tuple) cdef void _async_evict_oldest(self) + @cython.locals(expire_heap_len="unsigned int") + cdef void _maybe_rebuild_heap(self) + @cython.locals(service_record=DNSService) cdef void _async_remove(self, DNSRecord record) diff --git a/src/zeroconf/_cache.py b/src/zeroconf/_cache.py index 353117f3..3c45b733 100644 --- a/src/zeroconf/_cache.py +++ b/src/zeroconf/_cache.py @@ -110,9 +110,14 @@ def _async_add(self, record: _DNSRecord) -> bool: store[record] = record when = record.created + (record.ttl * 1000) if self._expirations.get(record) != when: - # Avoid adding duplicates to the heap heappush(self._expire_heap, (when, record)) self._expirations[record] = when + # Re-adds of an existing record with a new TTL push a fresh + # entry but leave the prior tuple behind as stale, so a peer + # that just replays cached records can grow ``_expire_heap`` + # without ever tripping the cap. Rebuild when stale entries + # dominate. + self._maybe_rebuild_heap() if isinstance(record, DNSService): service_record = record @@ -122,16 +127,28 @@ def _async_add(self, record: _DNSRecord) -> bool: return new def _async_evict_oldest(self) -> None: - """Drop the closest-to-expiration record to make room for a new one. - - Skips stale heap entries (records re-added with a different TTL), - which mirrors the staleness check in ``async_expire``. If the - heap is mostly stale (long stale prefix from sustained TTL - re-adds), rebuild it once up front so the pop loop below - doesn't do O(stale_prefix * log n) work on a single add. Same - heuristic / threshold as ``async_expire``. + """Drop the closest-to-expiration record to make room for a new one.""" + while self._expire_heap: + when_record = heappop(self._expire_heap) + record = when_record[1] + if self._expirations.get(record) != when_record[0]: + continue + self._async_remove(record) + return - This function must be run in from event loop. + def _maybe_rebuild_heap(self) -> None: + """Rebuild ``_expire_heap`` if stale entries dominate. + + Re-adds of an existing record with a new TTL append a fresh + ``(when, record)`` and leave the prior tuple behind as stale; + eviction's stale-skip loop and ``async_expire`` already absorb + these, but unchecked accumulation lets a peer that just replays + cached records grow the heap arbitrarily between cleanups. + Same threshold as the long-standing rebuild in ``async_expire``: + only fire when stale entries outweigh live ones (heap > 2 x + expirations), and only above a minimum floor so a small cache + isn't rebuilt for nothing. Amortized cost is O(1) per push; + the O(N) rebuild fires at most once per N stale pushes. """ expire_heap_len = len(self._expire_heap) if ( @@ -142,13 +159,6 @@ def _async_evict_oldest(self) -> None: entry for entry in self._expire_heap if self._expirations.get(entry[1]) == entry[0] ] heapify(self._expire_heap) - while self._expire_heap: - when_record = heappop(self._expire_heap) - record = when_record[1] - if self._expirations.get(record) != when_record[0]: - continue - self._async_remove(record) - return def async_add_records(self, entries: Iterable[DNSRecord]) -> bool: """Add multiple records. @@ -190,43 +200,23 @@ def async_expire(self, now: _float) -> list[DNSRecord]: :param now: The current time in milliseconds. """ - if not (expire_heap_len := len(self._expire_heap)): + if not self._expire_heap: return [] expired: list[DNSRecord] = [] - # Find any expired records and add them to the to-delete list while self._expire_heap: when_record = self._expire_heap[0] when = when_record[0] if when > now: break heappop(self._expire_heap) - # Check if the record hasn't been re-added to the heap - # with a different expiration time as it will be removed - # later when it reaches the top of the heap and its - # expiration time is met. + # Skip entries left behind by a TTL re-add; the live tuple is + # later in the heap and will be removed when it reaches the top. record = when_record[1] if self._expirations.get(record) == when: expired.append(record) - # If the expiration heap grows larger than the number expirations - # times two, we clean it up to avoid keeping expired entries in - # the heap and consuming memory. We guard this with a minimum - # threshold to avoid cleaning up the heap too often when there are - # only a few scheduled expirations. - if ( - expire_heap_len > _MIN_SCHEDULED_RECORD_EXPIRATION - and expire_heap_len > len(self._expirations) * 2 - ): - # Remove any expired entries from the expiration heap - # that do not match the expiration time in the expirations - # as it means the record has been re-added to the heap - # with a different expiration time. - self._expire_heap = [ - entry for entry in self._expire_heap if self._expirations.get(entry[1]) == entry[0] - ] - heapify(self._expire_heap) - + self._maybe_rebuild_heap() self.async_remove_records(expired) return expired diff --git a/tests/test_cache.py b/tests/test_cache.py index 80607ffd..a802d851 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -439,7 +439,9 @@ async def test_cache_heap_multi_name_cleanup() -> None: ) cache.async_add_records([record]) - assert len(cache._expire_heap) == min_records_to_cleanup + 5 + # ``_async_add`` rebuilds ``_expire_heap`` proactively when stale entries + # dominate (heap > 2x expirations), so the heap is already capped at + # ~one entry per unique record long before ``async_expire`` is called. assert len(cache.async_entries_with_name(name)) == 1 assert len(cache.async_entries_with_name(name2)) == 5 @@ -473,7 +475,8 @@ async def test_cache_heap_pops_order() -> None: ) cache.async_add_records([record]) - assert len(cache._expire_heap) == min_records_to_cleanup + 5 + # ``_async_add`` proactively rebuilds the heap when stale entries dominate, + # so the heap holds only one entry per unique record by this point. assert len(cache.async_entries_with_name(name)) == 1 assert len(cache.async_entries_with_name(name2)) == 5 @@ -484,6 +487,18 @@ async def test_cache_heap_pops_order() -> None: start_ts = ts +def _addr(name: str, idx: int, *, ttl: int = 120, created: float | None = None) -> r.DNSAddress: + """Build a DNSAddress with idx-derived payload for the bound/eviction tests.""" + return r.DNSAddress( + name, + const._TYPE_A, + const._CLASS_IN, + ttl, + bytes((idx & 0xFF, (idx >> 8) & 0xFF, 0, 1)), + created=r.current_time_millis() if created is None else created, + ) + + def test_cache_size_is_bounded() -> None: """A flood of unique-name records is capped at ``_MAX_CACHE_RECORDS``.""" cache = r.DNSCache() @@ -491,17 +506,7 @@ def test_cache_size_is_bounded() -> None: overflow = 1000 flood_size = const._MAX_CACHE_RECORDS + overflow - cache.async_add_records( - r.DNSAddress( - f"flood-{i}.local.", - const._TYPE_A, - const._CLASS_IN, - 120, - bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now + i, - ) - for i in range(flood_size) - ) + cache.async_add_records(_addr(f"flood-{i}.local.", i, created=now + i) for i in range(flood_size)) total = sum(len(store) for store in cache.cache.values()) assert total == const._MAX_CACHE_RECORDS @@ -515,293 +520,139 @@ def test_cache_size_is_bounded() -> None: def test_cache_eviction_skips_stale_heap_entries() -> None: - """Eviction must skip stale ``_expire_heap`` entries left by TTL re-adds.""" + """Eviction skips stale heap entries left by TTL re-adds.""" cache = r.DNSCache() now = r.current_time_millis() - # Fill to one shy of the cap so a single add triggers exactly one eviction cache.async_add_records( - r.DNSAddress( - f"stale-{i}.local.", - const._TYPE_A, - const._CLASS_IN, - 120, - bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now + i, - ) - for i in range(const._MAX_CACHE_RECORDS) + _addr(f"stale-{i}.local.", i, created=now + i) for i in range(const._MAX_CACHE_RECORDS) ) assert cache._total_records == const._MAX_CACHE_RECORDS - # Re-add the closest-to-expiration record with a longer TTL. The original - # (when, record) tuple stays at the head of _expire_heap as a stale entry; - # _expirations[record] now points to the new (later) expiration. Eviction - # must pop and skip that stale tuple before settling on a real victim. + # Re-add the closest-to-expiration record with a longer TTL; the prior + # ``(when, record)`` tuple stays as stale, eviction must skip it. victim_name = "stale-0.local." - refreshed = r.DNSAddress( - victim_name, - const._TYPE_A, - const._CLASS_IN, - 7200, - bytes((0, 0, 0, 1)), - created=now + 0, - ) - cache.async_add_records([refreshed]) + cache.async_add_records([_addr(victim_name, 0, ttl=7200, created=now)]) assert cache._total_records == const._MAX_CACHE_RECORDS - # Force one eviction. The next-oldest legitimate record must go; the - # refreshed record stays alive. - cache.async_add_records( - [ - r.DNSAddress( - "trigger.local.", - const._TYPE_A, - const._CLASS_IN, - 120, - b"\xff\xff\xff\xff", - created=now + const._MAX_CACHE_RECORDS, - ) - ] - ) + cache.async_add_records([_addr("trigger.local.", 0xFFFF, created=now + const._MAX_CACHE_RECORDS)]) assert cache._total_records == const._MAX_CACHE_RECORDS assert victim_name in cache.cache assert "stale-1.local." not in cache.cache def test_cache_eviction_victim_shares_key_with_new_record() -> None: - """Eviction must not orphan the new record when it collides on ``record.key``. - - If the closest-to-expiration record is the *last* one stored under - ``record.key`` and the incoming record uses the same key, - ``_remove_key`` deletes ``self.cache[record.key]`` during eviction. - A previous capture of ``store = self.cache.get(record.key)`` would - then write the new record into an orphaned dict not reachable via - the cache. Pin that the new record is reachable. - """ + """Inserting a record whose key collides with the eviction victim keeps it reachable.""" cache = r.DNSCache() now = r.current_time_millis() - # Fill the cache to one shy of the cap with unique-name records, each - # with a later created time than the shared-key victim below. cache.async_add_records( - r.DNSAddress( - f"filler-{i}.local.", - const._TYPE_A, - const._CLASS_IN, - 120, - bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now + 1000 + i, - ) - for i in range(const._MAX_CACHE_RECORDS - 1) + _addr(f"filler-{i}.local.", i, created=now + 1000 + i) for i in range(const._MAX_CACHE_RECORDS - 1) ) - # Insert a record at "shared.local." with the earliest expiration — - # this is the one closest_to_expiration eviction will pick. + # Insert at "shared.local." with the earliest expiration so eviction + # picks it. ``_remove_key`` then deletes ``cache["shared.local."]``. shared_key = "shared.local." - old_shared = r.DNSAddress( - shared_key, - const._TYPE_A, - const._CLASS_IN, - 120, - b"\x01\x02\x03\x04", - created=now, - ) - cache.async_add_records([old_shared]) + cache.async_add_records([_addr(shared_key, 0x0102, created=now)]) assert cache._total_records == const._MAX_CACHE_RECORDS - assert shared_key in cache.cache - - # Add a new record with the SAME key. Eviction will remove old_shared, - # which empties cache[shared_key], so _remove_key deletes that bucket. - # If _async_add captured the store before eviction it will now be - # writing into an orphaned dict. - new_shared = r.DNSAddress( - shared_key, - const._TYPE_A, - const._CLASS_IN, - 120, - b"\x05\x06\x07\x08", - created=now + 999, - ) + + # Adding a new record under the SAME key: a pre-eviction-captured + # ``store`` would write into an orphaned dict; the fix re-resolves. + new_shared = _addr(shared_key, 0x0506, created=now + 999) cache.async_add_records([new_shared]) - # The new record must be reachable via the cache. assert shared_key in cache.cache, "new record orphaned: cache bucket missing" assert new_shared in cache.cache[shared_key] assert cache.async_get_unique(new_shared) == new_shared - # And the counter accounting must still match observable state. total = sum(len(store) for store in cache.cache.values()) assert total == cache._total_records -def test_cache_eviction_rebuilds_heap_when_mostly_stale() -> None: - """Eviction rebuilds ``_expire_heap`` up front when stale entries dominate. - - Without the rebuild, a single ``_async_add`` at cap could pop a long - stale prefix one entry at a time — O(stale_prefix * log n). The - rebuild is the same heuristic ``async_expire`` already uses: when - ``len(heap) > 2 * len(expirations)``, the heap is reconstructed - from only the valid entries in one O(n) pass before the pop loop. - """ +def test_cache_re_add_flood_does_not_grow_heap_unbounded() -> None: + """Replaying cached records with shifting TTLs cannot grow ``_expire_heap`` unbounded.""" cache = r.DNSCache() now = r.current_time_millis() - base_kwargs = {"type_": const._TYPE_A, "class_": const._CLASS_IN} - - cache.async_add_records( - r.DNSAddress( - f"r-{i}.local.", - ttl=120, - address=bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now + i, - **base_kwargs, - ) - for i in range(const._MAX_CACHE_RECORDS) - ) - # Two re-adds with distinct TTLs leave two stale entries per record - # in ``_expire_heap`` while ``_expirations`` stays at MAX. Heap reaches - # ~3x MAX, which trips the ``len(heap) > 2 * len(expirations)`` - # threshold on the next eviction. - for ttl in (7200, 14400): + # Stay below the cache cap so eviction never fires; the attack here is + # heap growth via re-add, not cap saturation. Clear the + # ``_MIN_SCHEDULED_RECORD_EXPIRATION`` floor so the rebuild engages. + record_count = 200 + cache.async_add_records(_addr(f"flood-{i}.local.", i, created=now) for i in range(record_count)) + assert cache._total_records == record_count + + # 10 cycles x ``record_count`` stale pushes each. Without + # ``_maybe_rebuild_heap`` firing inside ``_async_add``, the heap would + # grow to ~11 x record_count. + for cycle in range(10): cache.async_add_records( - r.DNSAddress( - f"r-{i}.local.", - ttl=ttl, - address=bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now + i, - **base_kwargs, - ) - for i in range(const._MAX_CACHE_RECORDS) + _addr(f"flood-{i}.local.", i, ttl=7200 + cycle, created=now) for i in range(record_count) ) - assert len(cache._expire_heap) > 2 * len(cache._expirations) - # One more insert pushes us over the cap; eviction fires and must - # rebuild before scanning. - cache.async_add_records( - [ - r.DNSAddress( - "trigger.local.", - ttl=120, - address=b"\xff\xff\xff\xff", - created=now + const._MAX_CACHE_RECORDS, - **base_kwargs, - ) - ] - ) - # Rebuild dropped the stale entries; heap and expirations now match. - assert len(cache._expire_heap) == len(cache._expirations) == const._MAX_CACHE_RECORDS - assert cache._total_records == const._MAX_CACHE_RECORDS + # Heap is bounded near the rebuild threshold; ``+ record_count`` of slack + # to stay resilient to where in a re-add cycle the rebuild last fired. + assert len(cache._expire_heap) <= 2 * len(cache._expirations) + record_count + assert cache._total_records == record_count def test_cache_eviction_decrements_total_records() -> None: - """Natural removal (goodbyes, expirations) must keep ``_total_records`` in sync.""" + """Natural removal (goodbyes, expirations) keeps ``_total_records`` in sync.""" cache = r.DNSCache() now = r.current_time_millis() - records = [ - r.DNSAddress( - f"sync-{i}.local.", - const._TYPE_A, - const._CLASS_IN, - 120, - bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now, - ) - for i in range(50) - ] + records = [_addr(f"sync-{i}.local.", i, created=now) for i in range(50)] cache.async_add_records(records) assert cache._total_records == 50 cache.async_remove_records(records[:20]) assert cache._total_records == 30 - # async_expire on a future time should drop the rest cache.async_expire(now + (200 * 1000)) assert cache._total_records == 0 assert not cache.cache def test_cache_total_records_invariant_under_mixed_ops() -> None: - """``_total_records`` must equal ``sum(len(store) for store in cache.values())``. - - Walks the counter through every code path that touches it — new inserts, - re-adds of an existing record (no increment), DNSService (extra - service_cache write), DNSNsec (stored but not counted as "new" by the - flag), shared-key inserts that empty their bucket on removal, full-cap - eviction, async_expire, manual async_remove_records — and asserts the - invariant after every step. Any future change that misses an increment - or doubles a decrement will trip this immediately; ``_total_records`` is - ``cdef unsigned int`` in the Cython build, so silent underflow would - otherwise propagate as a permanent eviction-on-every-add storm. - """ + """``_total_records`` stays equal to the sum of bucket sizes across all touched paths.""" cache = r.DNSCache() now = r.current_time_millis() def actual() -> int: return sum(len(store) for store in cache.cache.values()) - # Fresh inserts: counter follows insert count exactly. - addrs = [ - r.DNSAddress( - f"mix-{i}.local.", const._TYPE_A, const._CLASS_IN, 120, bytes((i, 0, 0, 1)), created=now + i - ) - for i in range(20) - ] + addrs = [_addr(f"mix-{i}.local.", i, created=now + i) for i in range(20)] cache.async_add_records(addrs) assert cache._total_records == actual() == 20 - # Re-add of an identical record: no increment (already present). + # Re-add of an identical record: no increment. cache.async_add_records([addrs[0]]) assert cache._total_records == actual() == 20 - # DNSService writes service_cache too — counter must still match cache size. + # DNSService writes service_cache too — counter still matches cache size. svc = r.DNSService("svc.local.", const._TYPE_SRV, const._CLASS_IN, 120, 0, 0, 80, "host.local.") cache.async_add_records([svc]) assert cache._total_records == actual() == 21 cache.async_remove_records([svc]) assert cache._total_records == actual() == 20 - # DNSNsec is stored but excluded from the "new" return value; the counter - # still tracks it because it occupies a bucket slot. - nsec = r.DNSNsec( - "nsec.local.", - const._TYPE_NSEC, - const._CLASS_IN, - 120, - "nsec.local.", - [const._TYPE_A], - ) + # DNSNsec is stored but excluded from the "new" return; counter tracks it anyway. + nsec = r.DNSNsec("nsec.local.", const._TYPE_NSEC, const._CLASS_IN, 120, "nsec.local.", [const._TYPE_A]) cache.async_add_records([nsec]) assert cache._total_records == actual() == 21 cache.async_remove_records([nsec]) assert cache._total_records == actual() == 20 - # Shared-key insert/remove: emptying a bucket removes the cache key, but - # the counter must still decrement only by the number of records gone. - shared_a = r.DNSAddress( - "shared.local.", const._TYPE_A, const._CLASS_IN, 120, b"\x01\x01\x01\x01", created=now - ) - shared_b = r.DNSAddress( - "shared.local.", const._TYPE_A, const._CLASS_IN, 120, b"\x02\x02\x02\x02", created=now - ) + # Shared-key insert/remove: emptying the bucket drops the cache key but + # counter decrements only by the records that left. + shared_a = _addr("shared.local.", 0x0101, created=now) + shared_b = _addr("shared.local.", 0x0202, created=now) cache.async_add_records([shared_a, shared_b]) assert cache._total_records == actual() == 22 cache.async_remove_records([shared_a, shared_b]) assert cache._total_records == actual() == 20 assert "shared.local." not in cache.cache - # async_expire path: counter must follow the records dropped. cache.async_expire(now + (200 * 1000)) assert cache._total_records == actual() == 0 assert not cache.cache # Full-cap eviction loop: counter never grows past the cap, never drifts. - cap_records = [ - r.DNSAddress( - f"cap-{i}.local.", - const._TYPE_A, - const._CLASS_IN, - 120, - bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), - created=now + i, - ) - for i in range(const._MAX_CACHE_RECORDS + 50) - ] + cap_records = [_addr(f"cap-{i}.local.", i, created=now + i) for i in range(const._MAX_CACHE_RECORDS + 50)] for rec in cap_records: cache.async_add_records([rec]) assert cache._total_records == actual() From 052b1e2be2ec60e9697adec6640b221fae4331f3 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 20:56:41 -0700 Subject: [PATCH 5/9] test: cover empty-heap exit branch in _async_evict_oldest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``_async_evict_oldest`` falls through to a silent return if ``_expire_heap`` is empty. By the cache invariant every record counted in ``_total_records`` has a heap entry, so this branch is unreachable in normal operation — but it's a defense-in-depth path worth pinning. Force the broken state directly (``_total_records = MAX, _expire_heap = []``) and verify eviction returns gracefully while the subsequent insert still completes. --- tests/test_cache.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_cache.py b/tests/test_cache.py index a802d851..7c095911 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -519,6 +519,19 @@ def test_cache_size_is_bounded() -> None: assert f"flood-{i}.local." in cache.cache +def test_cache_eviction_empty_heap_returns_without_evicting() -> None: + """Eviction tolerates an empty ``_expire_heap`` (invariant-violation safety net).""" + cache = r.DNSCache() + # By the cache invariant every record in ``_total_records`` has a heap + # entry, so eviction should never see an empty heap. Force the broken + # state directly to pin the defensive behaviour: ``_async_evict_oldest`` + # returns without raising and the subsequent insert still lands. + cache._total_records = const._MAX_CACHE_RECORDS + cache._expire_heap = [] + cache.async_add_records([_addr("post-empty.local.", 0)]) + assert "post-empty.local." in cache.cache + + def test_cache_eviction_skips_stale_heap_entries() -> None: """Eviction skips stale heap entries left by TTL re-adds.""" cache = r.DNSCache() From a2a2d34c62acb32c16630e7966ec42a28caf3413 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 21:09:47 -0700 Subject: [PATCH 6/9] test: pin counter drift, restructure at-cap bench, trim docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three review follow-ups on #1718: **Bench actually measures eviction (Copilot inline at L43).** ``test_cache_add_at_cap_evicts`` previously created an empty cache and added ``_MAX_CACHE_RECORDS + 1000`` records, so ~91% of the measured time was the below-cap fill path and only the final 1000 inserts evicted — the CodSpeed number was not representative of the eviction performance the name implies. Restructure to pre-fill the cache to the cap *outside* the timed body and consume one fresh unique record per benchmark iteration. Each iteration measures exactly one ``_async_add`` + ``_async_evict_oldest`` cycle, with the cache permanently at the cap. ``_make_records`` gains a ``prefix=`` parameter so the fill records and the overflow pool don't collide on ``name``. **Defensive test pins counter state (Kōan suggestion #3).** ``test_cache_eviction_empty_heap_returns_without_evicting`` already asserts the new record lands; add the corresponding ``_total_records == _MAX_CACHE_RECORDS + 1`` assertion so a future change to the broken-state recovery (refuse the add, clamp the counter) fails this test explicitly rather than silently shifting behaviour. **One-line docstring (Kōan suggestion #2).** ``_maybe_rebuild_heap`` had an 11-line docstring restating the body and including amortized-cost analysis. Per CLAUDE.md the why lives at the call site comment in ``_async_add`` (already does); the function's contract is a single line. **No log on the empty-heap fall-through (Kōan suggestion #1).** Kōan suggested an assertion or debug log; the assertion would crash production on invariant drift, and even a guarded ``log.debug(...)`` adds Python attribute-lookup overhead in the function the cap overflow path runs through. Drift is already caught loudly by ``test_cache_total_records_invariant_under_mixed_ops``. Comment explains the deliberate silence. --- src/zeroconf/_cache.py | 17 ++++------------- tests/benchmarks/test_cache_bound.py | 25 +++++++++++++++++-------- tests/test_cache.py | 6 +++++- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/zeroconf/_cache.py b/src/zeroconf/_cache.py index 3c45b733..c9249946 100644 --- a/src/zeroconf/_cache.py +++ b/src/zeroconf/_cache.py @@ -135,21 +135,12 @@ def _async_evict_oldest(self) -> None: continue self._async_remove(record) return + # Heap-empty fall-through is unreachable by the cache invariant + # (every counted record has a heap entry); accounting drift would + # surface in tests via test_cache_total_records_invariant_under_mixed_ops. def _maybe_rebuild_heap(self) -> None: - """Rebuild ``_expire_heap`` if stale entries dominate. - - Re-adds of an existing record with a new TTL append a fresh - ``(when, record)`` and leave the prior tuple behind as stale; - eviction's stale-skip loop and ``async_expire`` already absorb - these, but unchecked accumulation lets a peer that just replays - cached records grow the heap arbitrarily between cleanups. - Same threshold as the long-standing rebuild in ``async_expire``: - only fire when stale entries outweigh live ones (heap > 2 x - expirations), and only above a minimum floor so a small cache - isn't rebuilt for nothing. Amortized cost is O(1) per push; - the O(N) rebuild fires at most once per N stale pushes. - """ + """Rebuild ``_expire_heap`` when stale entries dominate live ones.""" expire_heap_len = len(self._expire_heap) if ( expire_heap_len > _MIN_SCHEDULED_RECORD_EXPIRATION diff --git a/tests/benchmarks/test_cache_bound.py b/tests/benchmarks/test_cache_bound.py index 0ccb6bdd..07fbf107 100644 --- a/tests/benchmarks/test_cache_bound.py +++ b/tests/benchmarks/test_cache_bound.py @@ -8,14 +8,14 @@ from zeroconf.const import _CLASS_IN, _MAX_CACHE_RECORDS, _TYPE_A -def _make_records(count: int, now: float) -> list[DNSAddress]: +def _make_records(count: int, now: float, prefix: str = "bench") -> list[DNSAddress]: return [ DNSAddress( - f"bench-{i}.local.", + f"{prefix}-{i}.local.", _TYPE_A, _CLASS_IN, 120, - bytes((i & 0xFF, (i >> 8) & 0xFF, 0, 1)), + bytes(((i >> 24) & 0xFF, (i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)), created=now + i, ) for i in range(count) @@ -34,11 +34,20 @@ def _add() -> None: def test_cache_add_at_cap_evicts(benchmark: BenchmarkFixture) -> None: - """Steady-state add at the cap: every new record forces one eviction.""" + """Steady-state add at the cap: every measured insert forces one eviction. + + Pre-fills the cache to ``_MAX_CACHE_RECORDS`` outside the timed body so + only the eviction-path adds are measured. Each benchmark iteration + consumes one fresh unique record from a pre-built pool, keeping the + cache permanently at the cap and the work per iteration to a single + ``_async_add`` + ``_async_evict_oldest`` cycle. + """ now = current_time_millis() - overflow_records = _make_records(_MAX_CACHE_RECORDS + 1000, now) + cache = DNSCache() + cache.async_add_records(_make_records(_MAX_CACHE_RECORDS, now, prefix="fill")) + # Large pool so the iterator outlives any reasonable codspeed run count. + pool = iter(_make_records(100_000, now + _MAX_CACHE_RECORDS, prefix="evict")) @benchmark - def _flood() -> None: - cache = DNSCache() - cache.async_add_records(overflow_records) + def _evict_one() -> None: + cache.async_add_records([next(pool)]) diff --git a/tests/test_cache.py b/tests/test_cache.py index 7c095911..8c4fc47b 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -525,11 +525,15 @@ def test_cache_eviction_empty_heap_returns_without_evicting() -> None: # By the cache invariant every record in ``_total_records`` has a heap # entry, so eviction should never see an empty heap. Force the broken # state directly to pin the defensive behaviour: ``_async_evict_oldest`` - # returns without raising and the subsequent insert still lands. + # returns without raising and the subsequent insert still lands. Since + # eviction can't free space, the counter is allowed to drift past the + # cap by exactly one — pinned so a future change to the recovery + # semantics (e.g., refusing the add or clamping) fails this test. cache._total_records = const._MAX_CACHE_RECORDS cache._expire_heap = [] cache.async_add_records([_addr("post-empty.local.", 0)]) assert "post-empty.local." in cache.cache + assert cache._total_records == const._MAX_CACHE_RECORDS + 1 def test_cache_eviction_skips_stale_heap_entries() -> None: From 38b151779db3bf4351c5902366fed16f9b14c1c2 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 21:13:02 -0700 Subject: [PATCH 7/9] test: pin DNSNsec flood is bounded by _MAX_CACHE_RECORDS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ``new = is_new and not isinstance(record, DNSNsec)`` line in ``_async_add`` only gates the listener-notify return value — the counter increment, the cache store, and the cap check are all type-agnostic. A peer can't flood NSec records past the cap today. But the existing ``new`` flag is exactly the shape a future refactor might extend to ``_total_records`` ("DNSNsec isn't real data, don't count it"), which would silently open a bypass. Pin ``_total_records == _MAX_CACHE_RECORDS`` after pushing ``_MAX_CACHE_RECORDS + 100`` unique-name DNSNsec records so any such regression fails this test loudly. --- tests/test_cache.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_cache.py b/tests/test_cache.py index 8c4fc47b..31065076 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -583,6 +583,26 @@ def test_cache_eviction_victim_shares_key_with_new_record() -> None: assert total == cache._total_records +def test_cache_dnsnsec_flood_is_bounded() -> None: + """DNSNsec records honour ``_MAX_CACHE_RECORDS`` (no bypass via the ``new`` flag).""" + cache = r.DNSCache() + overflow = 100 + cache.async_add_records( + r.DNSNsec( + f"nsec-{i}.local.", + const._TYPE_NSEC, + const._CLASS_IN, + 120, + f"nsec-{i}.local.", + [const._TYPE_A], + ) + for i in range(const._MAX_CACHE_RECORDS + overflow) + ) + assert cache._total_records == const._MAX_CACHE_RECORDS + total = sum(len(store) for store in cache.cache.values()) + assert total == const._MAX_CACHE_RECORDS + + def test_cache_re_add_flood_does_not_grow_heap_unbounded() -> None: """Replaying cached records with shifting TTLs cannot grow ``_expire_heap`` unbounded.""" cache = r.DNSCache() From e5b509a93107ede3467c9d98cbcc789613230679 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 21:21:07 -0700 Subject: [PATCH 8/9] test: DNSNsec at-cap eviction + unbounded benchmark pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Cross-type NSec spot-test (Kōan).** Add ``test_cache_dnsnsec_at_cap_evicts_prior_record``: fill the cache to ``_MAX_CACHE_RECORDS`` with ``DNSAddress`` records, add a single ``DNSNsec``, assert the counter stays at cap (NSEC routed through eviction), the NSEC is reachable via ``cache.cache[nsec.key]`` (orphan fix covers NSEC's key-collision path), and the earliest fill record was the eviction victim. Complements the existing flood test — together they pin NSEC against both ``new``-flag drift and cross-type ``_async_add`` regressions. Uses direct cache lookup instead of ``async_get_unique`` because the latter's type stub excludes ``DNSNsec``. **Unbounded benchmark generator (Copilot inline at test_cache_bound.py:53).** The previous bench used ``iter(_make_records(100_000, ...))`` which raises ``StopIteration`` once exhausted — fast operations under pytest-codspeed can exceed that count. Replace with ``_unbounded_records`` (``itertools.count`` + lazy ``DNSAddress`` construction) so the iterator survives any iteration count CodSpeed chooses. The construction work moves into the timed body but is a constant per-iteration overhead and doesn't distort relative measurements across runs. --- tests/benchmarks/test_cache_bound.py | 29 +++++++++++++++++++++------- tests/test_cache.py | 25 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/tests/benchmarks/test_cache_bound.py b/tests/benchmarks/test_cache_bound.py index 07fbf107..774129e3 100644 --- a/tests/benchmarks/test_cache_bound.py +++ b/tests/benchmarks/test_cache_bound.py @@ -2,13 +2,16 @@ from __future__ import annotations +from collections.abc import Iterator +from itertools import count + from pytest_codspeed import BenchmarkFixture from zeroconf import DNSAddress, DNSCache, current_time_millis from zeroconf.const import _CLASS_IN, _MAX_CACHE_RECORDS, _TYPE_A -def _make_records(count: int, now: float, prefix: str = "bench") -> list[DNSAddress]: +def _make_records(count_: int, now: float, prefix: str = "bench") -> list[DNSAddress]: return [ DNSAddress( f"{prefix}-{i}.local.", @@ -18,10 +21,23 @@ def _make_records(count: int, now: float, prefix: str = "bench") -> list[DNSAddr bytes(((i >> 24) & 0xFF, (i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)), created=now + i, ) - for i in range(count) + for i in range(count_) ] +def _unbounded_records(now: float, prefix: str = "evict") -> Iterator[DNSAddress]: + """Unbounded generator of unique-name DNSAddress records.""" + for i in count(): + yield DNSAddress( + f"{prefix}-{i}.local.", + _TYPE_A, + _CLASS_IN, + 120, + bytes(((i >> 24) & 0xFF, (i >> 16) & 0xFF, (i >> 8) & 0xFF, i & 0xFF)), + created=now + i, + ) + + def test_cache_add_below_cap(benchmark: BenchmarkFixture) -> None: """Adding records while the cache is well below the cap (no eviction).""" now = current_time_millis() @@ -38,15 +54,14 @@ def test_cache_add_at_cap_evicts(benchmark: BenchmarkFixture) -> None: Pre-fills the cache to ``_MAX_CACHE_RECORDS`` outside the timed body so only the eviction-path adds are measured. Each benchmark iteration - consumes one fresh unique record from a pre-built pool, keeping the - cache permanently at the cap and the work per iteration to a single - ``_async_add`` + ``_async_evict_oldest`` cycle. + pulls one fresh unique record from an unbounded generator, keeping the + cache permanently at the cap. The generator avoids the iteration-count + cap that a pre-built pool would impose for very fast operations. """ now = current_time_millis() cache = DNSCache() cache.async_add_records(_make_records(_MAX_CACHE_RECORDS, now, prefix="fill")) - # Large pool so the iterator outlives any reasonable codspeed run count. - pool = iter(_make_records(100_000, now + _MAX_CACHE_RECORDS, prefix="evict")) + pool = _unbounded_records(now + _MAX_CACHE_RECORDS) @benchmark def _evict_one() -> None: diff --git a/tests/test_cache.py b/tests/test_cache.py index 31065076..aeb3a2ab 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -583,6 +583,31 @@ def test_cache_eviction_victim_shares_key_with_new_record() -> None: assert total == cache._total_records +def test_cache_dnsnsec_at_cap_evicts_prior_record() -> None: + """A single DNSNsec arriving at the cap evicts one prior record and stays reachable.""" + cache = r.DNSCache() + now = r.current_time_millis() + cache.async_add_records( + _addr(f"fill-{i}.local.", i, created=now + i) for i in range(const._MAX_CACHE_RECORDS) + ) + assert cache._total_records == const._MAX_CACHE_RECORDS + + nsec = r.DNSNsec( + "nsec-arrival.local.", + const._TYPE_NSEC, + const._CLASS_IN, + 120, + "nsec-arrival.local.", + [const._TYPE_A], + ) + cache.async_add_records([nsec]) + + assert cache._total_records == const._MAX_CACHE_RECORDS + assert nsec in cache.cache[nsec.key] + # The earliest-created fill record is gone (FIFO-ish eviction). + assert "fill-0.local." not in cache.cache + + def test_cache_dnsnsec_flood_is_bounded() -> None: """DNSNsec records honour ``_MAX_CACHE_RECORDS`` (no bypass via the ``new`` flag).""" cache = r.DNSCache() From 39ab7550d565fcb985b012275044e3b82fb0e58c Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Sun, 17 May 2026 21:23:26 -0700 Subject: [PATCH 9/9] chore: drop stale comment from _async_evict_oldest fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment said drift would surface via ``test_cache_total_records_invariant_under_mixed_ops``, but that test only exercises well-formed paths and never produces a heap-empty-with-counter-nonzero state. The test that actually pins the broken-invariant recovery is ``test_cache_eviction_empty_heap_returns_without_evicting``. Per ``CLAUDE.md`` ("default to no comments"; cross-references rot when referents are renamed) the comment shouldn't exist at all — the function name and docstring already describe the contract, and the while-loop fall-through is self-explanatory. --- src/zeroconf/_cache.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/zeroconf/_cache.py b/src/zeroconf/_cache.py index c9249946..df60982b 100644 --- a/src/zeroconf/_cache.py +++ b/src/zeroconf/_cache.py @@ -135,9 +135,6 @@ def _async_evict_oldest(self) -> None: continue self._async_remove(record) return - # Heap-empty fall-through is unreachable by the cache invariant - # (every counted record has a heap entry); accounting drift would - # surface in tests via test_cache_total_records_invariant_under_mixed_ops. def _maybe_rebuild_heap(self) -> None: """Rebuild ``_expire_heap`` when stale entries dominate live ones."""