Skip to content

Commit 58c93ff

Browse files
authored
🎨 Replace AST typing with the native syntax (3.6+) (#193)
1 parent 6ac98eb commit 58c93ff

7 files changed

Lines changed: 183 additions & 189 deletions

File tree

charset_normalizer/api.py

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,11 @@ def from_bytes(
6767
)
6868

6969
if explain:
70-
previous_logger_level = logger.level # type: int
70+
previous_logger_level: int = logger.level
7171
logger.addHandler(explain_handler)
7272
logger.setLevel(TRACE)
7373

74-
length = len(sequences) # type: int
74+
length: int = len(sequences)
7575

7676
if length == 0:
7777
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
@@ -116,8 +116,8 @@ def from_bytes(
116116
if steps > 1 and length / steps < chunk_size:
117117
chunk_size = int(length / steps)
118118

119-
is_too_small_sequence = len(sequences) < TOO_SMALL_SEQUENCE # type: bool
120-
is_too_large_sequence = len(sequences) >= TOO_BIG_SEQUENCE # type: bool
119+
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
120+
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
121121

122122
if is_too_small_sequence:
123123
logger.log(
@@ -134,11 +134,11 @@ def from_bytes(
134134
),
135135
)
136136

137-
prioritized_encodings = [] # type: List[str]
137+
prioritized_encodings: List[str] = []
138138

139-
specified_encoding = (
139+
specified_encoding: Optional[str] = (
140140
any_specified_encoding(sequences) if preemptive_behaviour else None
141-
) # type: Optional[str]
141+
)
142142

143143
if specified_encoding is not None:
144144
prioritized_encodings.append(specified_encoding)
@@ -148,15 +148,15 @@ def from_bytes(
148148
specified_encoding,
149149
)
150150

151-
tested = set() # type: Set[str]
152-
tested_but_hard_failure = [] # type: List[str]
153-
tested_but_soft_failure = [] # type: List[str]
151+
tested: Set[str] = set()
152+
tested_but_hard_failure: List[str] = []
153+
tested_but_soft_failure: List[str] = []
154154

155-
fallback_ascii = None # type: Optional[CharsetMatch]
156-
fallback_u8 = None # type: Optional[CharsetMatch]
157-
fallback_specified = None # type: Optional[CharsetMatch]
155+
fallback_ascii: Optional[CharsetMatch] = None
156+
fallback_u8: Optional[CharsetMatch] = None
157+
fallback_specified: Optional[CharsetMatch] = None
158158

159-
results = CharsetMatches() # type: CharsetMatches
159+
results: CharsetMatches = CharsetMatches()
160160

161161
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
162162

@@ -187,11 +187,11 @@ def from_bytes(
187187

188188
tested.add(encoding_iana)
189189

190-
decoded_payload = None # type: Optional[str]
191-
bom_or_sig_available = sig_encoding == encoding_iana # type: bool
192-
strip_sig_or_bom = bom_or_sig_available and should_strip_sig_or_bom(
190+
decoded_payload: Optional[str] = None
191+
bom_or_sig_available: bool = sig_encoding == encoding_iana
192+
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
193193
encoding_iana
194-
) # type: bool
194+
)
195195

196196
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
197197
logger.log(
@@ -202,7 +202,7 @@ def from_bytes(
202202
continue
203203

204204
try:
205-
is_multi_byte_decoder = is_multi_byte_encoding(encoding_iana) # type: bool
205+
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
206206
except (ModuleNotFoundError, ImportError):
207207
logger.log(
208208
TRACE,
@@ -237,7 +237,7 @@ def from_bytes(
237237
tested_but_hard_failure.append(encoding_iana)
238238
continue
239239

240-
similar_soft_failure_test = False # type: bool
240+
similar_soft_failure_test: bool = False
241241

242242
for encoding_soft_failed in tested_but_soft_failure:
243243
if is_cp_similar(encoding_iana, encoding_soft_failed):
@@ -259,11 +259,11 @@ def from_bytes(
259259
int(length / steps),
260260
)
261261

262-
multi_byte_bonus = (
262+
multi_byte_bonus: bool = (
263263
is_multi_byte_decoder
264264
and decoded_payload is not None
265265
and len(decoded_payload) < length
266-
) # type: bool
266+
)
267267

268268
if multi_byte_bonus:
269269
logger.log(
@@ -273,13 +273,13 @@ def from_bytes(
273273
encoding_iana,
274274
)
275275

276-
max_chunk_gave_up = int(len(r_) / 4) # type: int
276+
max_chunk_gave_up: int = int(len(r_) / 4)
277277

278278
max_chunk_gave_up = max(max_chunk_gave_up, 2)
279-
early_stop_count = 0 # type: int
279+
early_stop_count: int = 0
280280
lazy_str_hard_failure = False
281281

282-
md_chunks = [] # type: List[str]
282+
md_chunks: List[str] = []
283283
md_ratios = []
284284

285285
try:
@@ -334,9 +334,7 @@ def from_bytes(
334334
tested_but_hard_failure.append(encoding_iana)
335335
continue
336336

337-
mean_mess_ratio = (
338-
sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
339-
) # type: float
337+
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
340338
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
341339
tested_but_soft_failure.append(encoding_iana)
342340
logger.log(
@@ -371,7 +369,7 @@ def from_bytes(
371369
)
372370

373371
if not is_multi_byte_decoder:
374-
target_languages = encoding_languages(encoding_iana) # type: List[str]
372+
target_languages: List[str] = encoding_languages(encoding_iana)
375373
else:
376374
target_languages = mb_encoding_languages(encoding_iana)
377375

charset_normalizer/cd.py

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
2626

2727
decoder = importlib.import_module("encodings.{}".format(iana_name)).IncrementalDecoder # type: ignore
2828

29-
p = decoder(errors="ignore") # type: IncrementalDecoder
30-
seen_ranges = {} # type: Dict[str, int]
31-
character_count = 0 # type: int
29+
p: IncrementalDecoder = decoder(errors="ignore")
30+
seen_ranges: Dict[str, int] = {}
31+
character_count: int = 0
3232

3333
for i in range(0x40, 0xFF):
34-
chunk = p.decode(bytes([i])) # type: str
34+
chunk: str = p.decode(bytes([i]))
3535

3636
if chunk:
37-
character_range = unicode_range(chunk) # type: Optional[str]
37+
character_range: Optional[str] = unicode_range(chunk)
3838

3939
if character_range is None:
4040
continue
@@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
5858
"""
5959
Return inferred languages used with a unicode range.
6060
"""
61-
languages = [] # type: List[str]
61+
languages: List[str] = []
6262

6363
for language, characters in FREQUENCIES.items():
6464
for character in characters:
@@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]:
7575
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
7676
This function does the correspondence.
7777
"""
78-
unicode_ranges = encoding_unicode_range(iana_name) # type: List[str]
79-
primary_range = None # type: Optional[str]
78+
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
79+
primary_range: Optional[str] = None
8080

8181
for specified_range in unicode_ranges:
8282
if "Latin" not in specified_range:
@@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
115115
"""
116116
Determine main aspects from a supported language if it contains accents and if is pure Latin.
117117
"""
118-
target_have_accents = False # type: bool
119-
target_pure_latin = True # type: bool
118+
target_have_accents: bool = False
119+
target_pure_latin: bool = True
120120

121121
for character in FREQUENCIES[language]:
122122
if not target_have_accents and is_accentuated(character):
@@ -133,7 +133,7 @@ def alphabet_languages(
133133
"""
134134
Return associated languages associated to given characters.
135135
"""
136-
languages = [] # type: List[Tuple[str, float]]
136+
languages: List[Tuple[str, float]] = []
137137

138138
source_have_accents = any(is_accentuated(character) for character in characters)
139139

@@ -147,13 +147,13 @@ def alphabet_languages(
147147
if target_have_accents is False and source_have_accents:
148148
continue
149149

150-
character_count = len(language_characters) # type: int
150+
character_count: int = len(language_characters)
151151

152-
character_match_count = len(
152+
character_match_count: int = len(
153153
[c for c in language_characters if c in characters]
154-
) # type: int
154+
)
155155

156-
ratio = character_match_count / character_count # type: float
156+
ratio: float = character_match_count / character_count
157157

158158
if ratio >= 0.2:
159159
languages.append((language, ratio))
@@ -174,33 +174,33 @@ def characters_popularity_compare(
174174
if language not in FREQUENCIES:
175175
raise ValueError("{} not available".format(language))
176176

177-
character_approved_count = 0 # type: int
177+
character_approved_count: int = 0
178178
FREQUENCIES_language_set = set(FREQUENCIES[language])
179179

180180
for character in ordered_characters:
181181
if character not in FREQUENCIES_language_set:
182182
continue
183183

184-
characters_before_source = FREQUENCIES[language][
184+
characters_before_source: List[str] = FREQUENCIES[language][
185185
0 : FREQUENCIES[language].index(character)
186-
] # type: List[str]
187-
characters_after_source = FREQUENCIES[language][
186+
]
187+
characters_after_source: List[str] = FREQUENCIES[language][
188188
FREQUENCIES[language].index(character) :
189-
] # type: List[str]
190-
characters_before = ordered_characters[
189+
]
190+
characters_before: List[str] = ordered_characters[
191191
0 : ordered_characters.index(character)
192-
] # type: List[str]
193-
characters_after = ordered_characters[
192+
]
193+
characters_after: List[str] = ordered_characters[
194194
ordered_characters.index(character) :
195-
] # type: List[str]
195+
]
196196

197-
before_match_count = len(
197+
before_match_count: int = len(
198198
set(characters_before) & set(characters_before_source)
199-
) # type: int
199+
)
200200

201-
after_match_count = len(
201+
after_match_count: int = len(
202202
set(characters_after) & set(characters_after_source)
203-
) # type: int
203+
)
204204

205205
if len(characters_before_source) == 0 and before_match_count <= 4:
206206
character_approved_count += 1
@@ -232,12 +232,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
232232
if character.isalpha() is False:
233233
continue
234234

235-
character_range = unicode_range(character) # type: Optional[str]
235+
character_range: Optional[str] = unicode_range(character)
236236

237237
if character_range is None:
238238
continue
239239

240-
layer_target_range = None # type: Optional[str]
240+
layer_target_range: Optional[str] = None
241241

242242
for discovered_range in layers:
243243
if (
@@ -296,33 +296,33 @@ def coherence_ratio(
296296
A layer = Character extraction by alphabets/ranges.
297297
"""
298298

299-
results = [] # type: List[Tuple[str, float]]
300-
ignore_non_latin = False # type: bool
299+
results: List[Tuple[str, float]] = []
300+
ignore_non_latin: bool = False
301301

302-
sufficient_match_count = 0 # type: int
302+
sufficient_match_count: int = 0
303303

304304
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
305305
if "Latin Based" in lg_inclusion_list:
306306
ignore_non_latin = True
307307
lg_inclusion_list.remove("Latin Based")
308308

309309
for layer in alpha_unicode_split(decoded_sequence):
310-
sequence_frequencies = Counter(layer) # type: Counter
310+
sequence_frequencies: Counter = Counter(layer)
311311
most_common = sequence_frequencies.most_common()
312312

313-
character_count = sum(o for c, o in most_common) # type: int
313+
character_count: int = sum(o for c, o in most_common)
314314

315315
if character_count <= TOO_SMALL_SEQUENCE:
316316
continue
317317

318-
popular_character_ordered = [c for c, o in most_common] # type: List[str]
318+
popular_character_ordered: List[str] = [c for c, o in most_common]
319319

320320
for language in lg_inclusion_list or alphabet_languages(
321321
popular_character_ordered, ignore_non_latin
322322
):
323-
ratio = characters_popularity_compare(
323+
ratio: float = characters_popularity_compare(
324324
language, popular_character_ordered
325-
) # type: float
325+
)
326326

327327
if ratio < threshold:
328328
continue

charset_normalizer/cli/normalizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def cli_detect(argv: List[str] = None) -> int:
229229
my_file.close()
230230
continue
231231

232-
o_ = my_file.name.split(".") # type: List[str]
232+
o_: List[str] = my_file.name.split(".")
233233

234234
if args.replace is False:
235235
o_.insert(-1, best_guess.encoding)

0 commit comments

Comments (0)