fix: drain _seen_logs drift on hit path and pin FIFO by key identity

bdraco · bdraco · commit d6cddb9e559b · 2026-05-17T20:48:50.000-07:00
Two Copilot review follow-ups on #1717:

- ``_mark_seen`` returned early on cache hits, so a drift-overshoot
  left behind by concurrent inserts (free-threaded (FT) or
  sync-multi-instance callers racing the ``len < cap`` check) only
  drained on the next miss. A workload that's mostly hits after a
  contention burst could keep the dict permanently above the cap.
  Add a drift-drain ``while`` inside the hit branch, gated on
  ``len > _MAX_SEEN_LOGS`` so steady-state-at-cap hits remain a single
  ``len`` + compare past the membership check. The shared
  pop+race-tolerant idiom factors out into a small ``_evict_oldest``
  helper invoked only when the outer ``while`` predicate is true, so
  the hot-path cost is unchanged. Pin the behaviour with
  ``test_mark_seen_drains_drift_on_hit_path``.

- ``tests/test_protocol.py::test_seen_logs_is_bounded`` asserted FIFO
  eviction via substring matches on the exception text
  (``"'1.2.3.4', 0)" in k``). That tied the test to the current
  ``IncomingDecodeError`` message format, so a future normalization of
  dedup-key shape (as discussed on #1714) would break the test without
  changing the bounded/eviction behaviour. Snapshot the actual key the
  parser inserted per port via ``next(reversed(_seen_logs))`` and
  assert FIFO by key identity. Also pin
  ``len(set(keys_per_port)) == _MAX_SEEN_LOGS + overflow`` so a
  regression that dropped ``self.source`` from the exception text
  (collapsing all calls to one dedup key) still fails the test rather
  than silently passing — preserving the per-port-unique-key check
  from the prior Kōan review.
diff --git a/src/zeroconf/_logger.py b/src/zeroconf/_logger.py
@@ -43,6 +43,23 @@ def set_logger_level_if_unset() -> None:
 _seen_logs: dict[str, None] = {}
 
 
+def _evict_oldest(seen: dict[str, None]) -> bool:
+    """Pop the oldest entry from ``seen``; return False if empty or raced.
+
+    Individual dict ops (``pop`` with a default, ``next``) are atomic
+    on the free-threaded build, but the compound ``iter`` → ``next``
+    used to pick the FIFO victim can raise ``RuntimeError`` if
+    another thread mutates the dict between the two ops. The caller
+    breaks its drain loop on False so concurrent mutation can't make
+    it spin.
+    """
+    try:
+        seen.pop(next(iter(seen)), None)
+    except (RuntimeError, StopIteration):
+        return False
+    return True
+
+
 def _mark_seen(seen: dict[str, None], key: str) -> bool:
     """Record ``key`` in ``seen`` and return True if it was newly added.
 
@@ -53,26 +70,22 @@ def _mark_seen(seen: dict[str, None], key: str) -> bool:
 
     The dict is shared across all ``Zeroconf`` instances in the
     process; on the free-threaded build (3.14t) and under multi-
-    instance sync use, callers can race. Individual dict operations
-    (``in``, ``__setitem__``, ``pop``, ``len``) are atomic in CPython
-    3.13+ FT and don't crash, but the compound ``iter`` → ``next``
-    used to find the FIFO victim can raise ``RuntimeError`` if
-    another thread mutates the dict between the two ops. The
-    eviction loop drains until ``len(seen) < _MAX_SEEN_LOGS`` so that
-    any drift accumulated by prior concurrent inserts is corrected by
-    the next caller, and bails on ``RuntimeError`` (another thread is
-    already shrinking the set) so we don't spin.
+    instance sync use, callers can race the ``len < cap`` check and
+    both insert, leaving the dict transiently above the cap. The
+    drain loop runs on every call (steady-state-at-cap hits are a
+    single ``len`` + compare past the membership check because ``and``
+    skips the helper call) so a contention burst is corrected by the
+    next caller regardless of whether it's a hit or a miss.
     """
-    if key in seen:
-        return False
-    while len(seen) >= _MAX_SEEN_LOGS:
-        try:
-            oldest = next(iter(seen))
-        except (RuntimeError, StopIteration):
-            break
-        seen.pop(oldest, None)
-    seen[key] = None
-    return True
+    inserting = key not in seen
+    # Hit (``inserting`` is False): drain only if drifted above cap.
+    # Miss (``inserting`` is True): drain to ``cap - 1`` to make room
+    # for the new key. Bool subtracts as 0/1 to pick the right limit.
+    while len(seen) > _MAX_SEEN_LOGS - inserting and _evict_oldest(seen):
+        pass
+    if inserting:
+        seen[key] = None
+    return inserting
 
 
 class QuietLogger:
diff --git a/tests/test_logger.py b/tests/test_logger.py
@@ -126,6 +126,27 @@ def test_mark_seen_drains_drift_above_cap() -> None:
         assert f"k-{i}" not in seen
 
 
+def test_mark_seen_drains_drift_on_hit_path() -> None:
+    """``_mark_seen`` drains drift even when ``key`` is already cached.
+
+    A hit-heavy workload after a contention burst (e.g. the same
+    exception text deduplicated repeatedly) must still correct the
+    overshoot — otherwise the dict can sit permanently above the cap
+    until a miss happens to come along.
+    """
+    seen: dict[str, None] = {}
+    drift = 10
+    for i in range(_MAX_SEEN_LOGS + drift):
+        seen[f"k-{i}"] = None
+    # Hit on a non-oldest key — survives the drift drain.
+    hit_key = f"k-{_MAX_SEEN_LOGS}"
+    assert _mark_seen(seen, hit_key) is False
+    assert len(seen) == _MAX_SEEN_LOGS
+    assert hit_key in seen
+    for i in range(drift):
+        assert f"k-{i}" not in seen
+
+
 def test_seen_logs_is_bounded() -> None:
     """``_seen_logs`` stays at the cap and evicts oldest-first (FIFO)."""
     _logger._seen_logs.clear()
diff --git a/tests/test_protocol.py b/tests/test_protocol.py
@@ -973,17 +973,27 @@ def test_seen_logs_is_bounded():
     )
     overflow = 5
     _incoming_module._seen_logs.clear()
+    # Snapshot the actual key the parser inserted per port. This is whatever
+    # ``str(exc_info()[1])`` produces today — the test stays agnostic to the
+    # exception text format so a future normalization of the message (see
+    # the discussion on #1714) doesn't break the assertions, while still
+    # pinning that the parser exception path actually entered the dict.
+    keys_per_port: list[str] = []
     for port in range(_MAX_SEEN_LOGS + overflow):
         r.DNSIncoming(packet, ("1.2.3.4", port))
-    # Bound is hit exactly — confirms the parser exception path actually
-    # entered the dict with a per-port-unique key; a future change that
-    # dropped self.source from the exception text would collapse to a
-    # single dedup key and fail this assertion.
+        keys_per_port.append(next(reversed(_incoming_module._seen_logs)))
+    # Bound is hit exactly.
     assert len(_incoming_module._seen_logs) == _MAX_SEEN_LOGS
-    # FIFO eviction: the earliest port's exception string is gone, the
-    # latest port's is still present.
-    assert not any("'1.2.3.4', 0)" in k for k in _incoming_module._seen_logs)
-    assert any(f"'1.2.3.4', {_MAX_SEEN_LOGS + overflow - 1})" in k for k in _incoming_module._seen_logs)
+    # Each port produced a distinct dedup key — a regression that dropped
+    # the per-packet-varying component (e.g. self.source) from the exception
+    # text would collapse every call to a single key and fail this.
+    assert len(set(keys_per_port)) == _MAX_SEEN_LOGS + overflow
+    # FIFO eviction by key identity (no substring matching on the message
+    # format): the earliest ports' keys are gone, the latest ports' remain.
+    for port in range(overflow):
+        assert keys_per_port[port] not in _incoming_module._seen_logs
+    for port in range(_MAX_SEEN_LOGS, _MAX_SEEN_LOGS + overflow):
+        assert keys_per_port[port] in _incoming_module._seen_logs
 
 
 def test_label_length_attack():