|
11 | 11 | from importlib import import_module |
12 | 12 | from inspect import currentframe |
13 | 13 | from itertools import chain, product |
| 14 | +from locale import getlocale |
14 | 15 | from math import log |
15 | 16 | from platform import system |
16 | 17 | from random import randint |
|
35 | 36 | "isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "is_native", |
36 | 37 | "list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register", |
37 | 38 | "remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro", |
38 | | - "DARWIN", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] |
| 39 | + "DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"] |
39 | 40 | CODECS_REGISTRY = None |
40 | 41 | CODECS_CATEGORIES = ["native", "custom"] |
| 42 | +LANG = getlocale()[0][:2].lower() |
41 | 43 | MASKS = { |
42 | 44 | 'a': printable, |
43 | 45 | 'b': "".join(chr(i) for i in range(256)), |
|
62 | 64 | UNIX = DARWIN or LINUX |
63 | 65 | WINDOWS = system() == "Windows" |
64 | 66 |
|
| 67 | +LANG_BACKEND = None |
| 68 | +for lib in ["langid", "langdetect", "pycld2", "cld3", "textblob"]: |
| 69 | + try: |
| 70 | + globals()[lib] = __import__(lib) |
| 71 | + LANG_BACKEND = lib |
| 72 | + break |
| 73 | + except ImportError: |
| 74 | + pass |
| 75 | +CLD3_LANGUAGES = "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|hi|hm|hr|ht|hu|" \ |
| 76 | + "hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|ml|mn|mr|ms|mt|my|ne|nl|no|ny|" \ |
| 77 | + "pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu" \ |
| 78 | + .split("|") |
| 79 | +TEXTBLOB_LANGUAGES = "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|id|is|it|iw|" \ |
| 80 | + "ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|te|th|tl|tr|uk|ur|vi|yi|zh" \ |
| 81 | + .split("|") |
65 | 82 |
|
66 | 83 | entropy = lambda s: -sum([p * log(p, 2) for p in [float(s.count(c)) / len(s) for c in set(s)]]) |
67 | 84 |
|
@@ -199,25 +216,31 @@ def getregentry(encoding): |
199 | 216 | fenc, fdec, name = encode, decode, encoding |
200 | 217 | # prepare CodecInfo input arguments |
201 | 218 | if pattern: |
202 | | - m = re.match(pattern, encoding) |
| 219 | + m, args, i = re.match(pattern, encoding), [], 1 |
203 | 220 | try: |
204 | | - g = m.group(1) or "" |
205 | | - if g.isdigit(): |
206 | | - g = int(g) |
207 | | - fenc = fenc(g) if fenc else fenc |
208 | | - fdec = fdec(g) if fdec else fdec |
209 | | - except AttributeError: |
210 | | - # this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match |
211 | | - if m is not None: |
212 | | - raise |
213 | | - return |
| 221 | + while True: |
| 222 | + try: |
| 223 | + g = m.group(i) or "" |
| 224 | + if g.isdigit(): |
| 225 | + g = int(g) |
| 226 | + args += [g] |
| 227 | + i += 1 |
| 228 | + except AttributeError: |
 | 229 | + # this occurs when m is None (the pattern did not match the encoding name), meaning no match ; |
| 230 | + if m is not None: |
| 231 | + raise |
| 232 | + return |
214 | 233 | except IndexError: |
215 | | - # this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; in |
216 | | - # this case, if fenc/fdec is a decorated function, execute it with no arg |
217 | | - if fenc and len(getfullargspec(fenc).args) == 1: |
218 | | - fenc = fenc() |
219 | | - if fdec and len(getfullargspec(fdec).args) == 1: |
220 | | - fdec = fdec() |
 | 234 | + # this occurs while m is not None, once m.group(i) runs past the last capture group (end of the loop) ; |
 | 235 | + # with no captured group, if fenc/fdec is a decorated function, execute it with no arg ; otherwise, call it with the collected groups |
| 236 | + if len(args) == 0: |
| 237 | + if fenc and len(getfullargspec(fenc).args) == 1: |
| 238 | + fenc = fenc() |
| 239 | + if fdec and len(getfullargspec(fdec).args) == 1: |
| 240 | + fdec = fdec() |
| 241 | + else: |
| 242 | + fenc = fenc(*args) if fenc else fenc |
| 243 | + fdec = fdec(*args) if fdec else fdec |
221 | 244 | if fenc: |
222 | 245 | fenc = fix_inout_formats(fenc) |
223 | 246 | if fdec: |
@@ -1056,31 +1079,54 @@ def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=R |
1056 | 1079 | - `printables`: checks that every output character is in the set of printables |
1057 | 1080 | """) |
1058 | 1081 | stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s)) |
1059 | | -stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None |
1060 | | -stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 |
| 1082 | +stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables" |
| 1083 | +stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None |
| 1084 | +stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex" |
| 1085 | +stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6 |
| 1086 | +stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text" |
| 1087 | +stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None |
| 1088 | +stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag" |
| 1089 | +stopfunc.default = stopfunc.printables |
| 1090 | + |
| 1091 | + |
| 1092 | +def _detect(text): |
| 1093 | + _lb, t = LANG_BACKEND, ensure_str(text) |
| 1094 | + if _lb is None: |
| 1095 | + raise ValueError("No language backend installed") |
| 1096 | + return langid.classify(t)[0] if _lb == "langid" else \ |
| 1097 | + langdetect.detect(t) if _lb == "langdetect" else \ |
| 1098 | + pycld2.detect(t)[2][0][1] if _lb == "pycld2" else \ |
| 1099 | + cld3.get_language(t).language[:2] if _lb == "cld3" else \ |
| 1100 | + textblob.TextBlob(t).detect_language()[:2] |
| 1101 | + |
1061 | 1102 |
|
1062 | 1103 | def _lang(lang): |
1063 | 1104 | def _test(s): |
1064 | 1105 | if not stopfunc.text(s): |
1065 | 1106 | return False |
1066 | 1107 | try: |
1067 | | - return detect(ensure_str(s)) == lang |
| 1108 | + return _detect(ensure_str(s))[:2] == lang |
1068 | 1109 | except: |
1069 | 1110 | return False |
1070 | 1111 | return _test |
1071 | 1112 |
|
1072 | | -try: |
1073 | | - from langdetect import detect, PROFILES_DIRECTORY |
1074 | | - for lang in [p.replace("-", "") for p in os.listdir(PROFILES_DIRECTORY)]: |
1075 | | - setattr(stopfunc, "lang_%s" % lang, _lang(lang)) |
1076 | | -except ImportError: |
1077 | | - pass |
1078 | | - |
1079 | | - |
1080 | | -__flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", x) is not None |
1081 | | -def _flag(x): |
1082 | | - return __flag(ensure_str(x)) |
1083 | | -stopfunc.flag = _flag |
| 1113 | +if LANG_BACKEND: |
| 1114 | + _lb = LANG_BACKEND |
| 1115 | + if _lb == "langid": |
| 1116 | + langid.langid.load_model() |
| 1117 | + for lang in ( |
| 1118 | + langid.langid.identifier.nb_classes if _lb == "langid" else \ |
| 1119 | + [p.replace("-", "") for p in os.listdir(langdetect.PROFILES_DIRECTORY)] if _lb == "langdetect" else \ |
| 1120 | + list(set(x[1][:2] for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES)) if _lb == "pycld2" else \ |
| 1121 | + CLD3_LANGUAGES if _lb == "cld3" else \ |
| 1122 | + TEXTBLOB_LANGUAGES if _lb == "textblob" else \ |
| 1123 | + []): |
| 1124 | + n = "lang_%s" % lang |
| 1125 | + setattr(stopfunc, n, _lang(lang)) |
| 1126 | + getattr(stopfunc, n).__name__ = getattr(stopfunc, n).__qualname__ = n |
| 1127 | + flng = "lang_%s" % LANG |
| 1128 | + if getattr(stopfunc, flng, None): |
| 1129 | + stopfunc.default = getattr(stopfunc, flng) |
1084 | 1130 |
|
1085 | 1131 |
|
1086 | 1132 | def __develop(encodings): |
@@ -1140,7 +1186,7 @@ def expand(items, descr=None, transform=None): |
1140 | 1186 | if encoding in e: |
1141 | 1187 | continue |
1142 | 1188 | if debug: |
1143 | | - print("[*] Depth %d/%d ; trying %s" % (depth+1, max_depth, encoding)) |
| 1189 | + print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding)) |
1144 | 1190 | __guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result, |
1145 | 1191 | found + (encoding, ), stop, show, scoring_heuristic, extended, debug) |
1146 | 1192 |
|
@@ -1236,7 +1282,7 @@ def __score(prev_input, input, codec, heuristic=False, extended=False): |
1236 | 1282 | yield s, new_input, encoding |
1237 | 1283 |
|
1238 | 1284 |
|
1239 | | -def guess(input, stop_func=stopfunc.printables, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(), |
| 1285 | +def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(), |
1240 | 1286 | stop=True, show=False, scoring_heuristic=False, extended=False, debug=False): |
1241 | 1287 | """ Try decoding without the knowledge of the encoding(s). """ |
1242 | 1288 | if max_depth <= 0: |
|
0 commit comments