@@ -806,3 +806,93 @@ def test_detect_all_custom_no_match_encoding():
806806 compat_names = False ,
807807 )
808808 assert results [0 ]["encoding" ] == "ascii"
809+
810+
811+ # --- include_encodings accuracy preservation tests ---
812+
813+
@pytest.mark.parametrize(
    ("data", "include_set", "expected"),
    [
        # UTF-8 stage: multibyte UTF-8 with Latin single-byte confusables
        pytest.param(
            "Héllo wörld café résumé naïve über straße".encode(),
            ["utf-8", "cp1252", "iso8859-1"],
            "utf-8",
            id="utf8-with-latin-confusables",
        ),
        # BOM stage: UTF-8 BOM with alternatives present.
        # The utf-8-sig codec prepends the BOM itself; embedding a literal
        # U+FEFF in the payload as well would produce a double BOM (the inner
        # one decoding as a stray ZWNBSP), so the text is kept BOM-free here.
        pytest.param(
            "Hello world, this is a BOM test document.".encode("utf-8-sig"),
            ["utf-8-sig", "utf-8", "cp1252"],
            "utf-8-sig",
            id="utf8-bom-with-alternatives",
        ),
        # Escape stage: ISO-2022-JP with other Japanese encodings
        pytest.param(
            "こんにちは世界、これはテストです。".encode("iso2022_jp"),
            ["iso2022_jp_2", "utf-8", "euc_jis_2004"],
            "iso2022_jp_2",
            id="iso2022-jp-with-alternatives",
        ),
        # UTF-16 stage: BOM-less UTF-16-LE with endian confusable
        pytest.param(
            "Hello world, this is a longer UTF-16 test with café.".encode("utf-16-le"),
            ["utf-16-le", "utf-16-be", "utf-8"],
            "utf-16-le",
            id="utf16-le-with-endian-confusable",
        ),
        # Statistical stage: Cyrillic (windows-1251) with Cyrillic confusables
        pytest.param(
            (
                "Привет мир, как дела? Это тестовый текст на русском языке. "
                "Москва — столица России, крупнейший город страны."
            ).encode("windows-1251"),
            ["cp1251", "cp1252", "iso8859-5", "koi8-r"],
            "cp1251",
            id="windows-1251-with-cyrillic-confusables",
        ),
        # Statistical stage: Greek with Latin confusables
        pytest.param(
            (
                "Η Αθήνα είναι η πρωτεύουσα και μεγαλύτερη πόλη της Ελλάδας. "
                "Η πόλη έχει μακρά ιστορία που εκτείνεται πάνω από τρεις χιλιετίες."
            ).encode("iso-8859-7"),
            ["iso8859-7", "cp1252", "iso8859-1"],
            "iso8859-7",
            id="greek-with-latin-confusables",
        ),
        # Structural + Statistical: Japanese Shift-JIS with CJK confusables
        pytest.param(
            (
                "これはテストです。日本語のテキスト。東京は日本の首都です。"
                "人口は約1400万人で、世界最大の都市圏を形成しています。"
            ).encode("shift_jis"),
            ["shift_jis_2004", "euc_jis_2004", "gb18030", "big5hkscs"],
            "shift_jis_2004",
            id="shift-jis-with-cjk-confusables",
        ),
        # Structural + Statistical: Chinese GB18030 with CJK confusables
        pytest.param(
            (
                "这是中文测试文本,用于检测编码。北京是中国的首都,上海是最大的城市。"
            ).encode("gb18030"),
            ["gb18030", "big5hkscs", "euc_kr"],
            "gb18030",
            id="gb18030-with-cjk-confusables",
        ),
    ],
)
def test_include_encodings_preserves_accuracy(
    data: bytes, include_set: list[str], expected: str
) -> None:
    """include_encodings must not degrade accuracy when the correct encoding is present.

    Each test case targets a different pipeline stage (UTF-8, BOM, escape,
    UTF-16, statistical, structural) to ensure filtering the candidate set
    does not interfere with any detection path.

    Args:
        data: Raw bytes encoded with the encoding the detector should find.
        include_set: Candidate encodings passed to ``chardet.detect``; always
            contains the correct answer plus plausible confusables.
        expected: The encoding name the detector must report.
    """
    # compat_names=False so the result uses canonical names matching `expected`.
    result = chardet.detect(data, include_encodings=include_set, compat_names=False)
    assert result["encoding"] == expected, (
        f"expected={expected}, got={result['encoding']} "
        f"(confidence={result['confidence']:.2f})"
    )
0 commit comments