🎨 Enable strict type check and improve the project typing (#207)

Ousret · web-flow · commit f955341352e9 · 2022-08-14T19:46:28.000+02:00
Following #182
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -28,7 +28,7 @@ jobs:
         python setup.py install
     - name: Type checking (Mypy)
       run: |
-        mypy charset_normalizer
+        mypy --strict charset_normalizer
     - name: Import sorting check (isort)
       run: |
         isort --check charset_normalizer
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
@@ -1,7 +1,7 @@
 import logging
 from os import PathLike
 from os.path import basename, splitext
-from typing import BinaryIO, List, Optional, Set
+from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (
     coherence_ratio,
@@ -36,8 +36,8 @@ def from_bytes(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.2,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -486,8 +486,8 @@ def from_fp(
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -508,12 +508,12 @@ def from_fp(
 
 
 def from_path(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
     explain: bool = False,
 ) -> CharsetMatches:
@@ -535,12 +535,12 @@ def from_path(
 
 
 def normalize(
-    path: PathLike,
+    path: "PathLike[Any]",
     steps: int = 5,
     chunk_size: int = 512,
     threshold: float = 0.20,
-    cp_isolation: List[str] = None,
-    cp_exclusion: List[str] = None,
+    cp_isolation: Optional[List[str]] = None,
+    cp_exclusion: Optional[List[str]] = None,
     preemptive_behaviour: bool = True,
 ) -> CharsetMatch:
     """
diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py
@@ -2,7 +2,7 @@
 from codecs import IncrementalDecoder
 from collections import Counter
 from functools import lru_cache
-from typing import Dict, List, Optional, Tuple
+from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
 
 from .assets import FREQUENCIES
 from .constant import KO_NAMES, LANGUAGE_SUPPORTED_COUNT, TOO_SMALL_SEQUENCE, ZH_NAMES
@@ -24,7 +24,9 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
     if is_multi_byte_encoding(iana_name):
         raise IOError("Function not supported on multi-byte code page")
 
-    decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder  # type: ignore
+    decoder = importlib.import_module(
+        "encodings.{}".format(iana_name)
+    ).IncrementalDecoder
 
     p: IncrementalDecoder = decoder(errors="ignore")
     seen_ranges: Dict[str, int] = {}
@@ -307,7 +309,7 @@ def coherence_ratio(
         lg_inclusion_list.remove("Latin Based")
 
     for layer in alpha_unicode_split(decoded_sequence):
-        sequence_frequencies: Counter = Counter(layer)
+        sequence_frequencies: TypeCounter[str] = Counter(layer)
         most_common = sequence_frequencies.most_common()
 
         character_count: int = sum(o for c, o in most_common)
diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
@@ -3,7 +3,7 @@
 from json import dumps
 from os.path import abspath
 from platform import python_version
-from typing import List
+from typing import List, Optional
 
 try:
     from unicodedata2 import unidata_version
@@ -48,7 +48,7 @@ def query_yes_no(question: str, default: str = "yes") -> bool:
             sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
 
 
-def cli_detect(argv: List[str] = None) -> int:
+def cli_detect(argv: Optional[List[str]] = None) -> int:
     """
     CLI assistant using ARGV and ArgumentParser
     :param argv:
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
@@ -4,7 +4,16 @@
 from hashlib import sha256
 from json import dumps
 from re import sub
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Counter as TypeCounter,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
 from .md import mess_ratio
@@ -95,7 +104,7 @@ def coherence_non_latin(self) -> float:
         return 0.0
 
     @property
-    def w_counter(self) -> Counter:
+    def w_counter(self) -> TypeCounter[str]:
         """
         Word counter instance on decoded text.
         Notice: Will be removed in 3.0
@@ -280,7 +289,7 @@ class CharsetMatches:
     Act like a list(iterable) but does not implements all related methods.
     """
 
-    def __init__(self, results: List[CharsetMatch] = None):
+    def __init__(self, results: Optional[List[CharsetMatch]] = None):
         self._results: List[CharsetMatch] = sorted(results) if results else []
 
     def __iter__(self) -> Iterator[CharsetMatch]:
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
@@ -13,7 +13,7 @@
 from re import findall
 from typing import Generator, List, Optional, Set, Tuple, Union
 
-from _multibytecodec import MultibyteIncrementalDecoder  # type: ignore
+from _multibytecodec import MultibyteIncrementalDecoder
 
 from .constant import (
     ENCODING_MARKS,
@@ -231,6 +231,9 @@ def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional
     for specified_encoding in results:
         specified_encoding = specified_encoding.lower().replace("-", "_")
 
+        encoding_alias: str
+        encoding_iana: str
+
         for encoding_alias, encoding_iana in aliases.items():
             if encoding_alias == specified_encoding:
                 return encoding_iana
@@ -256,7 +259,7 @@ def is_multi_byte_encoding(name: str) -> bool:
         "utf_32_be",
         "utf_7",
     } or issubclass(
-        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,  # type: ignore
+        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
         MultibyteIncrementalDecoder,
     )
 
@@ -286,6 +289,9 @@ def should_strip_sig_or_bom(iana_encoding: str) -> bool:
 def iana_name(cp_name: str, strict: bool = True) -> str:
     cp_name = cp_name.lower().replace("-", "_")
 
+    encoding_alias: str
+    encoding_iana: str
+
     for encoding_alias, encoding_iana in aliases.items():
         if cp_name in [encoding_alias, encoding_iana]:
             return encoding_iana
@@ -315,8 +321,12 @@ def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
     if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
         return 0.0
 
-    decoder_a = importlib.import_module("encodings.{}".format(iana_name_a)).IncrementalDecoder  # type: ignore
-    decoder_b = importlib.import_module("encodings.{}".format(iana_name_b)).IncrementalDecoder  # type: ignore
+    decoder_a = importlib.import_module(
+        "encodings.{}".format(iana_name_a)
+    ).IncrementalDecoder
+    decoder_b = importlib.import_module(
+        "encodings.{}".format(iana_name_b)
+    ).IncrementalDecoder
 
     id_a: IncrementalDecoder = decoder_a(errors="ignore")
     id_b: IncrementalDecoder = decoder_b(errors="ignore")