Skip to content

Commit 575fa96

Browse files
dan-blanchard and claude committed
test: add include_encodings accuracy preservation tests
Parametrized test with 8 cases covering the detection pipeline stages (UTF-8, BOM, escape, UTF-16, statistical/Cyrillic, statistical/Greek, structural/Japanese, structural/Chinese) to verify that include_encodings does not degrade detection accuracy when the correct encoding is in the candidate set.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 66e21fc commit 575fa96

1 file changed

Lines changed: 90 additions & 0 deletions

File tree

tests/test_api.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,3 +806,93 @@ def test_detect_all_custom_no_match_encoding():
806806
compat_names=False,
807807
)
808808
assert results[0]["encoding"] == "ascii"
809+
810+
811+
# --- include_encodings accuracy preservation tests ---
812+
813+
814+
@pytest.mark.parametrize(
    ("data", "include_set", "expected"),
    [
        # UTF-8 stage: multibyte UTF-8 with Latin single-byte confusables
        pytest.param(
            "Héllo wörld café résumé naïve über straße".encode(),
            ["utf-8", "cp1252", "iso8859-1"],
            "utf-8",
            id="utf8-with-latin-confusables",
        ),
        # BOM stage: UTF-8 BOM with alternatives present.
        # The "utf-8-sig" codec prepends the BOM itself, so the text must not
        # also start with U+FEFF -- that would emit two BOMs in a row.
        pytest.param(
            "Hello world, this is a BOM test document.".encode("utf-8-sig"),
            ["utf-8-sig", "utf-8", "cp1252"],
            "utf-8-sig",
            id="utf8-bom-with-alternatives",
        ),
        # Escape stage: ISO-2022-JP with other Japanese encodings.
        # The payload is encoded as iso2022_jp but only iso2022_jp_2 is in the
        # candidate set, so that is the name the test expects back.
        pytest.param(
            "こんにちは世界、これはテストです。".encode("iso2022_jp"),
            ["iso2022_jp_2", "utf-8", "euc_jis_2004"],
            "iso2022_jp_2",
            id="iso2022-jp-with-alternatives",
        ),
        # UTF-16 stage: BOM-less UTF-16-LE with endian confusable
        pytest.param(
            "Hello world, this is a longer UTF-16 test with café.".encode("utf-16-le"),
            ["utf-16-le", "utf-16-be", "utf-8"],
            "utf-16-le",
            id="utf16-le-with-endian-confusable",
        ),
        # Statistical stage: Cyrillic (windows-1251) with Cyrillic confusables
        pytest.param(
            (
                "Привет мир, как дела? Это тестовый текст на русском языке. "
                "Москва — столица России, крупнейший город страны."
            ).encode("windows-1251"),
            ["cp1251", "cp1252", "iso8859-5", "koi8-r"],
            "cp1251",
            id="windows-1251-with-cyrillic-confusables",
        ),
        # Statistical stage: Greek with Latin confusables
        pytest.param(
            (
                "Η Αθήνα είναι η πρωτεύουσα και μεγαλύτερη πόλη της Ελλάδας. "
                "Η πόλη έχει μακρά ιστορία που εκτείνεται πάνω από τρεις χιλιετίες."
            ).encode("iso-8859-7"),
            ["iso8859-7", "cp1252", "iso8859-1"],
            "iso8859-7",
            id="greek-with-latin-confusables",
        ),
        # Structural + Statistical: Japanese Shift-JIS with CJK confusables.
        # Encoded as shift_jis; shift_jis_2004 is the candidate offered.
        pytest.param(
            (
                "これはテストです。日本語のテキスト。東京は日本の首都です。"
                "人口は約1400万人で、世界最大の都市圏を形成しています。"
            ).encode("shift_jis"),
            ["shift_jis_2004", "euc_jis_2004", "gb18030", "big5hkscs"],
            "shift_jis_2004",
            id="shift-jis-with-cjk-confusables",
        ),
        # Structural + Statistical: Chinese GB18030 with CJK confusables
        pytest.param(
            (
                "这是中文测试文本,用于检测编码。北京是中国的首都,上海是最大的城市。"
            ).encode("gb18030"),
            ["gb18030", "big5hkscs", "euc_kr"],
            "gb18030",
            id="gb18030-with-cjk-confusables",
        ),
    ],
)
def test_include_encodings_preserves_accuracy(
    data: bytes, include_set: list[str], expected: str
) -> None:
    """Check that ``include_encodings`` keeps the correct answer when it is offered.

    Every case passes a candidate list that contains the expected encoding
    alongside plausible confusables, and asserts that ``chardet.detect``
    still reports the expected one. Together the cases exercise the UTF-8,
    BOM, escape, UTF-16, statistical and structural + statistical paths
    named in the per-case comments; the statistical and structural paths
    are each covered by two scripts.

    Args:
        data: Encoded sample bytes handed to ``chardet.detect``.
        include_set: Encoding names passed as ``include_encodings``.
        expected: Encoding name the result's ``"encoding"`` field must equal.
    """
    # compat_names=False mirrors the neighbouring tests in this file; the
    # expected names in the table are compared against that output verbatim.
    result = chardet.detect(data, include_encodings=include_set, compat_names=False)
    assert result["encoding"] == expected, (
        f"expected={expected}, got={result['encoding']} "
        f"(confidence={result['confidence']:.2f})"
    )

0 commit comments

Comments
 (0)