@@ -26,15 +26,15 @@ def encoding_unicode_range(iana_name: str) -> List[str]:
2626
2727 decoder = importlib .import_module ("encodings.{}" .format (iana_name )).IncrementalDecoder # type: ignore
2828
29- p = decoder (errors = "ignore" ) # type: IncrementalDecoder
30- seen_ranges = {} # type : Dict[str, int]
31- character_count = 0 # type: int
29+ p : IncrementalDecoder = decoder (errors = "ignore" )
30+ seen_ranges : Dict [str , int ] = {}
31+ character_count : int = 0
3232
3333 for i in range (0x40 , 0xFF ):
34- chunk = p .decode (bytes ([i ])) # type: str
34+ chunk : str = p .decode (bytes ([i ]))
3535
3636 if chunk :
37- character_range = unicode_range (chunk ) # type: Optional[str]
37+ character_range : Optional [ str ] = unicode_range (chunk )
3838
3939 if character_range is None :
4040 continue
@@ -58,7 +58,7 @@ def unicode_range_languages(primary_range: str) -> List[str]:
5858 """
5959 Return inferred languages used with a unicode range.
6060 """
61- languages = [] # type: List[str ]
61+ languages : List [ str ] = [ ]
6262
6363 for language , characters in FREQUENCIES .items ():
6464 for character in characters :
@@ -75,8 +75,8 @@ def encoding_languages(iana_name: str) -> List[str]:
7575 Single-byte encoding language association. Some code page are heavily linked to particular language(s).
7676 This function does the correspondence.
7777 """
78- unicode_ranges = encoding_unicode_range (iana_name ) # type: List[str]
79- primary_range = None # type : Optional[str]
78+ unicode_ranges : List [ str ] = encoding_unicode_range (iana_name )
79+ primary_range : Optional [str ] = None
8080
8181 for specified_range in unicode_ranges :
8282 if "Latin" not in specified_range :
@@ -115,8 +115,8 @@ def get_target_features(language: str) -> Tuple[bool, bool]:
115115 """
116116 Determine main aspects from a supported language if it contains accents and if is pure Latin.
117117 """
118- target_have_accents = False # type: bool
119- target_pure_latin = True # type: bool
118+ target_have_accents : bool = False
119+ target_pure_latin : bool = True
120120
121121 for character in FREQUENCIES [language ]:
122122 if not target_have_accents and is_accentuated (character ):
@@ -133,7 +133,7 @@ def alphabet_languages(
133133 """
134134 Return associated languages associated to given characters.
135135 """
136- languages = [] # type : List[Tuple[str, float]]
136+ languages : List [Tuple [str , float ]] = [ ]
137137
138138 source_have_accents = any (is_accentuated (character ) for character in characters )
139139
@@ -147,13 +147,13 @@ def alphabet_languages(
147147 if target_have_accents is False and source_have_accents :
148148 continue
149149
150- character_count = len (language_characters ) # type: int
150+ character_count : int = len (language_characters )
151151
152- character_match_count = len (
152+ character_match_count : int = len (
153153 [c for c in language_characters if c in characters ]
154- ) # type: int
154+ )
155155
156- ratio = character_match_count / character_count # type: float
156+ ratio : float = character_match_count / character_count
157157
158158 if ratio >= 0.2 :
159159 languages .append ((language , ratio ))
@@ -174,33 +174,33 @@ def characters_popularity_compare(
174174 if language not in FREQUENCIES :
175175 raise ValueError ("{} not available" .format (language ))
176176
177- character_approved_count = 0 # type: int
177+ character_approved_count : int = 0
178178 FREQUENCIES_language_set = set (FREQUENCIES [language ])
179179
180180 for character in ordered_characters :
181181 if character not in FREQUENCIES_language_set :
182182 continue
183183
184- characters_before_source = FREQUENCIES [language ][
184+ characters_before_source : List [ str ] = FREQUENCIES [language ][
185185 0 : FREQUENCIES [language ].index (character )
186- ] # type: List[str]
187- characters_after_source = FREQUENCIES [language ][
186+ ]
187+ characters_after_source : List [ str ] = FREQUENCIES [language ][
188188 FREQUENCIES [language ].index (character ) :
189- ] # type: List[str]
190- characters_before = ordered_characters [
189+ ]
190+ characters_before : List [ str ] = ordered_characters [
191191 0 : ordered_characters .index (character )
192- ] # type: List[str]
193- characters_after = ordered_characters [
192+ ]
193+ characters_after : List [ str ] = ordered_characters [
194194 ordered_characters .index (character ) :
195- ] # type: List[str]
195+ ]
196196
197- before_match_count = len (
197+ before_match_count : int = len (
198198 set (characters_before ) & set (characters_before_source )
199- ) # type: int
199+ )
200200
201- after_match_count = len (
201+ after_match_count : int = len (
202202 set (characters_after ) & set (characters_after_source )
203- ) # type: int
203+ )
204204
205205 if len (characters_before_source ) == 0 and before_match_count <= 4 :
206206 character_approved_count += 1
@@ -232,12 +232,12 @@ def alpha_unicode_split(decoded_sequence: str) -> List[str]:
232232 if character .isalpha () is False :
233233 continue
234234
235- character_range = unicode_range (character ) # type: Optional[str]
235+ character_range : Optional [ str ] = unicode_range (character )
236236
237237 if character_range is None :
238238 continue
239239
240- layer_target_range = None # type : Optional[str]
240+ layer_target_range : Optional [str ] = None
241241
242242 for discovered_range in layers :
243243 if (
@@ -296,33 +296,33 @@ def coherence_ratio(
296296 A layer = Character extraction by alphabets/ranges.
297297 """
298298
299- results = [] # type : List[Tuple[str, float]]
300- ignore_non_latin = False # type: bool
299+ results : List [Tuple [str , float ]] = [ ]
300+ ignore_non_latin : bool = False
301301
302- sufficient_match_count = 0 # type: int
302+ sufficient_match_count : int = 0
303303
304304 lg_inclusion_list = lg_inclusion .split ("," ) if lg_inclusion is not None else []
305305 if "Latin Based" in lg_inclusion_list :
306306 ignore_non_latin = True
307307 lg_inclusion_list .remove ("Latin Based" )
308308
309309 for layer in alpha_unicode_split (decoded_sequence ):
310- sequence_frequencies = Counter (layer ) # type: Counter
310+ sequence_frequencies : Counter = Counter (layer )
311311 most_common = sequence_frequencies .most_common ()
312312
313- character_count = sum (o for c , o in most_common ) # type: int
313+ character_count : int = sum (o for c , o in most_common )
314314
315315 if character_count <= TOO_SMALL_SEQUENCE :
316316 continue
317317
318- popular_character_ordered = [c for c , o in most_common ] # type: List[str ]
318+ popular_character_ordered : List [ str ] = [c for c , o in most_common ]
319319
320320 for language in lg_inclusion_list or alphabet_languages (
321321 popular_character_ordered , ignore_non_latin
322322 ):
323- ratio = characters_popularity_compare (
323+ ratio : float = characters_popularity_compare (
324324 language , popular_character_ordered
325- ) # type: float
325+ )
326326
327327 if ratio < threshold :
328328 continue
0 commit comments