Skip to content

Commit f955341

Browse files
authored
🎨 Enable strict type check and improve the project typing (#207)
Following #182
1 parent 6155b6b commit f955341

6 files changed

Lines changed: 45 additions & 24 deletions

File tree

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ jobs:
2828
python setup.py install
2929
- name: Type checking (Mypy)
3030
run: |
31-
mypy charset_normalizer
31+
mypy --strict charset_normalizer
3232
- name: Import sorting check (isort)
3333
run: |
3434
isort --check charset_normalizer

charset_normalizer/api.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import logging
22
from os import PathLike
33
from os.path import basename, splitext
4-
from typing import BinaryIO, List, Optional, Set
4+
from typing import Any, BinaryIO, List, Optional, Set
55

66
from .cd import (
77
coherence_ratio,
@@ -36,8 +36,8 @@ def from_bytes(
3636
steps: int = 5,
3737
chunk_size: int = 512,
3838
threshold: float = 0.2,
39-
cp_isolation: List[str] = None,
40-
cp_exclusion: List[str] = None,
39+
cp_isolation: Optional[List[str]] = None,
40+
cp_exclusion: Optional[List[str]] = None,
4141
preemptive_behaviour: bool = True,
4242
explain: bool = False,
4343
) -> CharsetMatches:
@@ -486,8 +486,8 @@ def from_fp(
486486
steps: int = 5,
487487
chunk_size: int = 512,
488488
threshold: float = 0.20,
489-
cp_isolation: List[str] = None,
490-
cp_exclusion: List[str] = None,
489+
cp_isolation: Optional[List[str]] = None,
490+
cp_exclusion: Optional[List[str]] = None,
491491
preemptive_behaviour: bool = True,
492492
explain: bool = False,
493493
) -> CharsetMatches:
@@ -508,12 +508,12 @@ def from_fp(
508508

509509

510510
def from_path(
511-
path: PathLike,
511+
path: "PathLike[Any]",
512512
steps: int = 5,
513513
chunk_size: int = 512,
514514
threshold: float = 0.20,
515-
cp_isolation: List[str] = None,
516-
cp_exclusion: List[str] = None,
515+
cp_isolation: Optional[List[str]] = None,
516+
cp_exclusion: Optional[List[str]] = None,
517517
preemptive_behaviour: bool = True,
518518
explain: bool = False,
519519
) -> CharsetMatches:
@@ -535,12 +535,12 @@ def from_path(
535535

536536

537537
def normalize(
538-
path: PathLike,
538+
path: "PathLike[Any]",
539539
steps: int = 5,
540540
chunk_size: int = 512,
541541
threshold: float = 0.20,
542-
cp_isolation: List[str] = None,
543-
cp_exclusion: List[str] = None,
542+
cp_isolation: Optional[List[str]] = None,
543+
cp_exclusion: Optional[List[str]] = None,
544544
preemptive_behaviour: bool = True,
545545
) -> CharsetMatch:
546546
"""

charset_normalizer/cd.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
from codecs import IncrementalDecoder
33
from collections import Counter
44
from functools import lru_cache
5-
from typing import Dict, List, Optional, Tuple
5+
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
66

77
from .assets import FREQUENCIES
88
from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
@@ -24,7 +24,9 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
2424
if is_multi_byte_encoding(iana_name):
2525
raise IOError("Function not supported on multi-byte code page")
2626

27-
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
27+
decoder = importlib.import_module(
28+
"encodings.{}".format(iana_name)
29+
).IncrementalDecoder
2830

2931
p: IncrementalDecoder = decoder(errors="ignore")
3032
seen_ranges: Dict[str, int] = {}
@@ -307,7 +309,7 @@ def coherence_ratio(
307309
lg_inclusion_list.remove("Latin Based")
308310

309311
for layer in alpha_unicode_split(decoded_sequence):
310-
sequence_frequencies: Counter = Counter(layer)
312+
sequence_frequencies: TypeCounter[str] = Counter(layer)
311313
most_common = sequence_frequencies.most_common()
312314

313315
character_count: int = sum(o for c, o in most_common)

charset_normalizer/cli/normalizer.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from json import dumps
44
from os.path import abspath
55
from platform import python_version
6-
from typing import List
6+
from typing import List, Optional
77

88
try:
99
from unicodedata2 import unidata_version
@@ -48,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
4848
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
4949

5050

51-
def cli_detect(argv: List[str] = None) -> int:
51+
def cli_detect(argv: Optional[List[str]] = None) -> int:
5252
"""
5353
CLI assistant using ARGV and ArgumentParser
5454
:param argv:

charset_normalizer/models.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,16 @@
44
from hashlib import sha256
55
from json import dumps
66
from re import sub
7-
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
7+
from typing import (
8+
Any,
9+
Counter as TypeCounter,
10+
Dict,
11+
Iterator,
12+
List,
13+
Optional,
14+
Tuple,
15+
Union,
16+
)
817

918
from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
1019
from .md import mess_ratio
@@ -95,7 +104,7 @@ def coherence_non_latin(self) -> float:
95104
return 0.0
96105

97106
@property
98-
def w_counter(self) -> Counter:
107+
def w_counter(self) -> TypeCounter[str]:
99108
"""
100109
Word counter instance on decoded text.
101110
Notice: Will be removed in 3.0
@@ -280,7 +289,7 @@ class CharsetMatches:
280289
Act like a list(iterable) but does not implements all related methods.
281290
"""
282291

283-
def __init__(self, results: List[CharsetMatch] = None):
292+
def __init__(self, results: Optional[List[CharsetMatch]] = None):
284293
self._results: List[CharsetMatch] = sorted(results) if results else []
285294

286295
def __iter__(self) -> Iterator[CharsetMatch]:

charset_normalizer/utils.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from re import findall
1414
from typing import Generator, List, Optional, Set, Tuple, Union
1515

16-
from _multibytecodec import MultibyteIncrementalDecoder # type: ignore
16+
from _multibytecodec import MultibyteIncrementalDecoder
1717

1818
from .constant import (
1919
ENCODING_MARKS,
@@ -231,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
231231
for specified_encoding in results:
232232
specified_encoding = specified_encoding.lower().replace("-", "_")
233233

234+
encoding_alias: str
235+
encoding_iana: str
236+
234237
for encoding_alias, encoding_iana in aliases.items():
235238
if encoding_alias == specified_encoding:
236239
return encoding_iana
@@ -256,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
256259
"utf_32_be",
257260
"utf_7",
258261
} or issubclass(
259-
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder, # type: ignore
262+
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
260263
MultibyteIncrementalDecoder,
261264
)
262265

@@ -286,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
286289
def iana_name(cp_name: str, strict: bool = True) -> str:
287290
cp_name = cp_name.lower().replace("-", "_")
288291

292+
encoding_alias: str
293+
encoding_iana: str
294+
289295
for encoding_alias, encoding_iana in aliases.items():
290296
if cp_name in [encoding_alias, encoding_iana]:
291297
return encoding_iana
@@ -315,8 +321,12 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
315321
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
316322
return 0.0
317323

318-
decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder # type: ignore
319-
decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder # type: ignore
324+
decoder_a = importlib.import_module(
325+
"encodings.{}".format(iana_name_a)
326+
).IncrementalDecoder
327+
decoder_b = importlib.import_module(
328+
"encodings.{}".format(iana_name_b)
329+
).IncrementalDecoder
320330

321331
id_a: IncrementalDecoder = decoder_a(errors="ignore")
322332
id_b: IncrementalDecoder = decoder_b(errors="ignore")

0 commit comments

Comments
 (0)