
Commit 89a9a4c

Fix null-separated ASCII misdetected as UTF-16-BE (#347)
Squashed commits:

* docs: add design spec for null separator tolerance

  Addresses #346 — ASCII text with null byte separators (common in Unix CLI output) being misdetected as utf-16-be.

* docs: address spec review feedback

  - Clarify UTF-16 guard applies in both single/dual candidate paths
  - Note mypyc compilation constraint for utf1632.py
  - Detail ASCII implementation using existing _ALLOWED_ASCII table
  - Clarify pipeline reorder: computation order vs return order
  - Note UniversalDetector propagation
  - Fix language=None vs "" discrepancy in test expectations

* docs: add implementation plan for null separator tolerance

  6-task TDD plan covering UTF-16 guard, null-tolerant ASCII detection, and pipeline reorder. Addresses #346.

* test: add failing tests for null-separator UTF-16 false positive (#346)

* fix: reject null-separator false positives in UTF-16 detector (#346)

* test: add failing tests for null-tolerant ASCII detection (#346)

* feat: tolerate sparse null separators in ASCII detection (#346)

* fix: reorder pipeline so ASCII precheck prevents false binary classification (#346)

* chore: remove planning docs from branch

  Spec and plan are preserved in git history but don't need to be in the final merge diff.

* fix: tighten ASCII null-fraction threshold from 10% to 5%

  Real-world null-separator data (find -print0, git ls-tree -z) is 1-3.5% nulls. 5% covers all realistic cases while staying well below the UTF-16 guard threshold (15%).

* refactor: use bytes.translate in null-separator guard for consistency

  Replace Python-level all() loop with C-level bytes.translate(), matching the pattern used in binary.py and ascii.py. Cross-references the shared ASCII byte set with ascii.py's _ALLOWED_ASCII.

* refactor: share ASCII byte-set constant across pipeline modules

  Extract ASCII_TEXT_BYTES to pipeline/__init__.py and use it in both ascii.py and utf1632.py to prevent drift between the two definitions.

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a98f097 commit 89a9a4c

7 files changed

Lines changed: 226 additions & 22 deletions

File tree

src/chardet/pipeline/__init__.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -15,6 +15,11 @@
 #: Deleting all bytes >= 0x80 and comparing lengths gives the non-ASCII count.
 HIGH_BYTES: bytes = bytes(range(0x80, 0x100))
 
+#: Bytes considered valid in ASCII text: tab (0x09), newline (0x0A),
+#: carriage return (0x0D), and printable ASCII (0x20-0x7E).
+#: Used by ``ascii.py`` directly and by ``utf1632.py`` (with null added).
+ASCII_TEXT_BYTES: bytes = bytes([0x09, 0x0A, 0x0D, *range(0x20, 0x7F)])
+
 
 class DetectionDict(TypedDict):
     """Dictionary representation of a detection result.
```
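The delete-and-compare idiom behind `ASCII_TEXT_BYTES` can be sketched standalone (the constant is re-declared here so the snippet runs without the package; `non_ascii_remainder` is an illustrative name, not part of the module):

```python
# Re-declaration of the shared constant for a self-contained sketch.
ASCII_TEXT_BYTES: bytes = bytes([0x09, 0x0A, 0x0D, *range(0x20, 0x7F)])


def non_ascii_remainder(data: bytes) -> bytes:
    """Delete every allowed ASCII byte; whatever survives is suspect."""
    # bytes.translate(None, delete) runs in C: one pass, no Python loop.
    return data.translate(None, ASCII_TEXT_BYTES)


print(non_ascii_remainder(b"plain text\n"))         # b''
print(non_ascii_remainder(b"a\x00b\x00c"))          # b'\x00\x00'
print(non_ascii_remainder("café".encode("utf-8")))  # b'\xc3\xa9'
```

An empty remainder means pure ASCII; a remainder of only nulls is the separator case the rest of this commit handles.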

src/chardet/pipeline/ascii.py

Lines changed: 23 additions & 10 deletions
```diff
@@ -1,23 +1,36 @@
-"""Stage 1c: Pure ASCII detection."""
+"""Stage 1c: Pure ASCII detection (with null-separator tolerance)."""
 
 from __future__ import annotations
 
-from chardet.pipeline import DetectionResult
+from chardet.pipeline import ASCII_TEXT_BYTES, DetectionResult
 
-# Allowed ASCII bytes: tab (0x09), newline (0x0A), carriage return (0x0D),
-# and printable ASCII (0x20-0x7E). bytes.translate deletes these from the
-# input; if anything remains, the data is not pure ASCII.
-_ALLOWED_ASCII: bytes = bytes([0x09, 0x0A, 0x0D, *range(0x20, 0x7F)])
+# Maximum fraction of null bytes to still classify data as ASCII.
+# Null-separated CLI output (find -print0, git ls-tree -z) typically has
+# 1-3.5% nulls. 5% covers all realistic cases while staying well below
+# the UTF-16 guard threshold (15%).
+_MAX_NULL_FRACTION = 0.05
 
 
 def detect_ascii(data: bytes) -> DetectionResult | None:
-    """Return an ASCII result if all bytes are printable ASCII plus common whitespace.
+    r"""Return an ASCII result if all bytes are printable ASCII plus common whitespace.
+
+    Tolerates sparse null bytes (``\x00``) up to ``_MAX_NULL_FRACTION`` of
+    the data, returning confidence 0.99 instead of 1.0 to distinguish from
+    pure ASCII.
 
     :param data: The raw byte data to examine.
     :returns: A :class:`DetectionResult` for ASCII, or ``None``.
     """
     if not data:
         return None
-    if data.translate(None, _ALLOWED_ASCII):
-        return None  # Non-allowed bytes remain
-    return DetectionResult(encoding="ascii", confidence=1.0, language=None)
+    remainder = data.translate(None, ASCII_TEXT_BYTES)
+    if not remainder:
+        return DetectionResult(encoding="ascii", confidence=1.0, language=None)
+    # Check if the only non-allowed bytes are null separators
+    if remainder.replace(b"\x00", b""):
+        return None  # Non-null, non-ASCII bytes present
+    # All non-allowed bytes are nulls — accept if sparse enough
+    null_fraction = len(remainder) / len(data)
+    if null_fraction <= _MAX_NULL_FRACTION:
+        return DetectionResult(encoding="ascii", confidence=0.99, language=None)
+    return None
```
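The effect of the 5% threshold on realistic inputs can be checked with plain arithmetic, no chardet import needed (a sketch; the 0.05 cutoff mirrors `_MAX_NULL_FRACTION` above):

```python
samples = {
    "find -print0": b"/home/user/documents/report.txt\x00"
                    b"/home/user/documents/notes.txt\x00",
    "dense nulls":  b"ab\x00cd\x00ef\x00gh\x00ij\x00",
}
for name, data in samples.items():
    # bytes.count(0) counts null bytes; compare against the 5% cutoff.
    frac = data.count(0) / len(data)
    verdict = "ascii @ 0.99" if frac <= 0.05 else "rejected"
    print(f"{name}: {frac:.1%} nulls -> {verdict}")
```

The `find -print0` sample is about 3% nulls and stays ASCII; the dense sample is 33% nulls and falls through to later pipeline stages.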

src/chardet/pipeline/orchestrator.py

Lines changed: 16 additions & 6 deletions
```diff
@@ -535,9 +535,20 @@ def _run_pipeline_core(  # noqa: PLR0913
     # markup) so that explicit charset declarations still take precedence.
     utf8_precheck = detect_utf8(data)
 
-    # Stage 0: Binary detection (skip when data is valid multi-byte UTF-8)
+    # Pre-check ASCII to prevent false binary classification. ASCII text
+    # with null byte separators (e.g. find -print0 output) would exceed the
+    # binary threshold due to the null bytes. Like the UTF-8 precheck, we
+    # compute the result now but return it at the normal position (after
+    # markup) so explicit charset declarations still take precedence.
+    ascii_precheck = detect_ascii(data)
+
+    # Stage 0: Binary detection (skip when data is valid UTF-8 or ASCII)
     # Binary detection (encoding=None) is NOT gated by filters.
-    if utf8_precheck is None and is_binary(data, max_bytes=max_bytes):
+    if (
+        utf8_precheck is None
+        and ascii_precheck is None
+        and is_binary(data, max_bytes=max_bytes)
+    ):
         return [_BINARY_RESULT]
 
     # Stage 1b: Markup charset extraction (before ASCII/UTF-8 so explicit
@@ -547,10 +558,9 @@ def _run_pipeline_core(  # noqa: PLR0913
     if markup_result is not None and markup_result.encoding in allowed:
         return [markup_result]
 
-    # Stage 1c: ASCII
-    ascii_result = detect_ascii(data)
-    if ascii_result is not None and ascii_result.encoding in allowed:
-        return [ascii_result]
+    # Stage 1c: ASCII (use pre-computed result)
+    if ascii_precheck is not None and ascii_precheck.encoding in allowed:
+        return [ascii_precheck]
 
     # Stage 1d: UTF-8 structural validation (use pre-computed result)
     if utf8_precheck is not None and utf8_precheck.encoding in allowed:
```
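The reorder is about computation order, not return order: both prechecks run before Stage 0 so the binary gate can consult them, while their results are still returned at their usual positions after markup. A simplified sketch of that control flow, with the stage functions passed in as hypothetical stand-ins for the real detectors:

```python
def run_pipeline(data, detect_utf8, detect_ascii, is_binary, detect_markup):
    # Prechecks: computed early so Stage 0 can consult them, but their
    # results are not returned yet -- markup must still take precedence.
    utf8_precheck = detect_utf8(data)
    ascii_precheck = detect_ascii(data)

    # Stage 0: binary, skipped when either precheck succeeded.
    if utf8_precheck is None and ascii_precheck is None and is_binary(data):
        return "binary"

    # Stage 1b: explicit markup charset declarations win.
    if (markup := detect_markup(data)) is not None:
        return markup

    # Stages 1c/1d: return the pre-computed results in their usual order.
    if ascii_precheck is not None:
        return ascii_precheck
    return utf8_precheck
```

With null-separated ASCII, `is_binary` would fire on the nulls; because `ascii_precheck` already succeeded, Stage 0 is skipped and the ASCII result is returned at Stage 1c.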

src/chardet/pipeline/utf1632.py

Lines changed: 39 additions & 3 deletions
```diff
@@ -11,7 +11,7 @@
 
 import unicodedata
 
-from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult
+from chardet.pipeline import ASCII_TEXT_BYTES, DETERMINISTIC_CONFIDENCE, DetectionResult
 
 # How many bytes to sample for pattern analysis
 _SAMPLE_SIZE = 4096
@@ -38,6 +38,38 @@
 # considered text rather than binary data.
 _MIN_PRINTABLE_FRACTION = 0.7
 
+# Maximum null fraction (in the candidate null-byte position) below which
+# the data is checked for a null-separator pattern. If the null fraction
+# is below this AND all non-null bytes are printable ASCII, the candidate
+# is rejected as a null-separator false positive rather than real UTF-16.
+# Real Latin UTF-16 has ~50% nulls; CJK UTF-16 has fewer but non-ASCII
+# non-null bytes. 15% is generous — separator data is typically 1-5%.
+_NULL_SEPARATOR_MAX_FRACTION = 0.15
+
+# ASCII_TEXT_BYTES plus the null byte — used by the null-separator guard
+# to check whether non-null bytes are all printable ASCII.
+_NULL_SEPARATOR_ALLOWED: bytes = b"\x00" + ASCII_TEXT_BYTES
+
+
+def _is_null_separator_pattern(data: bytes, null_frac: float) -> bool:
+    """Return True if the data looks like ASCII with null byte separators.
+
+    :param data: The raw byte sample to examine.
+    :param null_frac: The positional null fraction for this UTF-16 candidate
+        (i.e. fraction of null bytes in even positions for BE, or odd
+        positions for LE) — not the total null fraction across all bytes.
+
+    Checks two conditions:
+    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
+    2. Every non-null byte is printable ASCII or common whitespace
+
+    When both conditions are met, the nulls are likely field separators
+    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
+    """
+    if null_frac >= _NULL_SEPARATOR_MAX_FRACTION:
+        return False
+    return not data.translate(None, _NULL_SEPARATOR_ALLOWED)
+
 
 def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
     """Detect UTF-32 or UTF-16 encoding from null-byte patterns.
@@ -149,9 +181,13 @@ def _check_utf16(data: bytes) -> DetectionResult | None:
     le_frac = le_null_count / num_units
 
     candidates: list[tuple[str, float]] = []
-    if le_frac >= _UTF16_MIN_NULL_FRACTION:
+    if le_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
+        data[:sample_len], le_frac
+    ):
         candidates.append(("utf-16-le", le_frac))
-    if be_frac >= _UTF16_MIN_NULL_FRACTION:
+    if be_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
+        data[:sample_len], be_frac
+    ):
         candidates.append(("utf-16-be", be_frac))
 
     if not candidates:
```

tests/test_ascii.py

Lines changed: 63 additions & 3 deletions
```diff
@@ -41,7 +41,67 @@ def test_all_printable_ascii():
 
 
 def test_null_byte_not_ascii():
-    # Null bytes should have been caught by binary detection (Stage 0),
-    # but ASCII check should still reject them
-    result = detect_ascii(b"Hello\x00world")
+    # 2 nulls in 10 bytes = 20% → above threshold, not ASCII
+    result = detect_ascii(b"Hello\x00\x00rld")
     assert result is None
+
+
+def test_ascii_with_sparse_null_separators():
+    """ASCII with null separators below 5% threshold → confidence 0.99."""
+    data = (
+        b"master:README.md\x002\x00For support slack to #kodiak-support\n"
+        b"master:support.txt\x001\x00For support slack to #kodiak-support\n"
+    )
+    result = detect_ascii(data)
+    assert result is not None
+    assert result.encoding == "ascii"
+    assert result.confidence == 0.99
+
+
+def test_ascii_with_null_separated_paths():
+    """Find -print0 style output → ASCII at 0.99."""
+    data = (
+        b"/home/user/documents/report.txt\x00"
+        b"/home/user/documents/notes.txt\x00"
+        b"/home/user/downloads/image.png\x00"
+        b"/home/user/music/song.mp3\x00"
+    )
+    result = detect_ascii(data)
+    assert result is not None
+    assert result.encoding == "ascii"
+    assert result.confidence == 0.99
+
+
+def test_ascii_with_null_at_boundary():
+    """Exactly 5% nulls (1 in 20 bytes) is at the threshold — still ASCII."""
+    result = detect_ascii(b"abcdefghij\x00klmnopqrs")  # 1/20 = 5%
+    assert result is not None
+    assert result.encoding == "ascii"
+    assert result.confidence == 0.99
+
+
+def test_ascii_with_null_just_above_boundary():
+    """Just above 5% nulls → not ASCII."""
+    result = detect_ascii(b"abcdefghij\x00klmnopqr")  # 1/19 = 5.26%
+    assert result is None
+
+
+def test_ascii_with_high_null_fraction():
+    """More than 5% null bytes → not ASCII."""
+    # 5 nulls in 15 bytes = 33%
+    data = b"ab\x00cd\x00ef\x00gh\x00ij\x00"
+    result = detect_ascii(data)
+    assert result is None
+
+
+def test_ascii_with_nulls_and_high_bytes():
+    """Nulls mixed with non-ASCII bytes → not ASCII."""
+    data = b"Hello\x00\x80World"
+    result = detect_ascii(data)
+    assert result is None
+
+
+def test_pure_ascii_still_confidence_1():
+    """Pure ASCII without nulls still returns confidence 1.0."""
+    result = detect_ascii(b"Hello, world!")
+    assert result == DetectionResult("ascii", 1.0, None)
```

tests/test_github_issues.py

Lines changed: 31 additions & 0 deletions
```diff
@@ -438,3 +438,34 @@ def test_issue_67_no_crash(self) -> None:
         # Just verify it doesn't crash; any result is acceptable
         assert isinstance(result, dict)
         assert "encoding" in result
+
+
+# =========================================================================
+# NULL SEPARATOR ISSUES
+# =========================================================================
+
+
+class TestNullSeparators:
+    """ASCII text with null byte separators."""
+
+    def test_issue_346_null_separated_ascii(self) -> None:
+        """Issue #346: Null-separated ASCII detected as utf-16-be."""
+        data = (
+            b"master:README.md\x002\x00For support slack to #kodiak-support\n"
+            b"master:support.txt\x001\x00For support slack to #kodiak-support\n"
+        )
+        result = chardet.detect(data)
+        assert result["encoding"] == "ascii"
+        assert result["confidence"] == 0.99
+
+    def test_find_print0_output(self) -> None:
+        """Find -print0 style output should be detected as ASCII."""
+        data = (
+            b"/home/user/documents/report.txt\x00"
+            b"/home/user/documents/notes.txt\x00"
+            b"/home/user/downloads/image.png\x00"
+            b"/home/user/music/song.mp3\x00"
+        )
+        result = chardet.detect(data)
+        assert result["encoding"] == "ascii"
+        assert result["confidence"] == 0.99
```

tests/test_utf1632.py

Lines changed: 49 additions & 0 deletions
```diff
@@ -569,3 +569,52 @@ def test_text_quality_no_letters() -> None:
     quality = _text_quality(text)
     # No letters, so letter ratio is 0, ascii bonus is 0
     assert quality < 0.5
+
+
+# ---------------------------------------------------------------------------
+# Null-separator guard: sparse nulls in ASCII should NOT trigger UTF-16
+# ---------------------------------------------------------------------------
+
+
+def test_null_separated_ascii_not_utf16() -> None:
+    """ASCII with null byte separators should not be detected as UTF-16.
+
+    Regression test for chardet/chardet#346.
+    """
+    data = (
+        b"master:README.md\x002\x00For support slack to #kodiak-support\n"
+        b"master:support.txt\x001\x00For support slack to #kodiak-support\n"
+    )
+    result = detect_utf1632_patterns(data)
+    assert result is None
+
+
+def test_null_separated_paths_not_utf16() -> None:
+    """Find -print0 style output should not be detected as UTF-16."""
+    data = (
+        b"/home/user/documents/report.txt\x00"
+        b"/home/user/documents/notes.txt\x00"
+        b"/home/user/downloads/image.png\x00"
+        b"/home/user/music/song.mp3\x00"
+    )
+    result = detect_utf1632_patterns(data)
+    assert result is None
+
+
+def test_real_utf16_be_still_detected() -> None:
+    """Real UTF-16-BE text must still be detected after the guard is added."""
+    text = "The quick brown fox jumps over the lazy dog."
+    data = text.encode("utf-16-be")
+    result = detect_utf1632_patterns(data)
+    assert result is not None
+    assert result.encoding == "utf-16-be"
+    assert result.confidence == DETERMINISTIC_CONFIDENCE
+
+
+def test_real_utf16_le_cjk_still_detected() -> None:
+    """CJK UTF-16-LE must still be detected (low null fraction but
+    non-ASCII non-null bytes)."""
+    text = "This document: \u4f60\u597d\u4e16\u754c\uff0c\u6b22\u8fce\u6765\u5230\u8fd9\u91cc\u3002"
+    data = text.encode("utf-16-le")
+    result = detect_utf1632_patterns(data)
+    assert result is not None
+    assert result.encoding == "utf-16-le"
```
