Skip to content

Commit bfd0ee8

Browse files
committed
Added language detection backends
1 parent 351fa75 commit bfd0ee8

1 file changed

Lines changed: 81 additions & 35 deletions

File tree

codext/__common__.py

Lines changed: 81 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from importlib import import_module
1212
from inspect import currentframe
1313
from itertools import chain, product
14+
from locale import getlocale
1415
from math import log
1516
from platform import system
1617
from random import randint
@@ -35,9 +36,10 @@
3536
"isb", "generate_strings_from_regex", "get_alphabet_from_mask", "handle_error", "is_native",
3637
"list_categories", "list_encodings", "list_macros", "lookup", "maketrans", "os", "rank", "re", "register",
3738
"remove", "reset", "s2i", "search", "stopfunc", "BytesIO", "_input", "_stripl", "CodecMacro",
38-
"DARWIN", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"]
39+
"DARWIN", "LANG", "LINUX", "MASKS", "PY3", "UNIX", "WINDOWS"]
3940
CODECS_REGISTRY = None
4041
CODECS_CATEGORIES = ["native", "custom"]
42+
# 2-letter lowercase language code taken from the current locale ; falls back to an empty string when no
#  locale is set (getlocale() then returns None as first item, e.g. with the "C" locale) or when the locale
#  name cannot be parsed (getlocale() raises ValueError), so that importing the module never fails here
try:
    LANG = (getlocale()[0] or "")[:2].lower()
except ValueError:
    LANG = ""
4143
MASKS = {
4244
'a': printable,
4345
'b': "".join(chr(i) for i in range(256)),
@@ -62,6 +64,21 @@
6264
UNIX = DARWIN or LINUX
6365
WINDOWS = system() == "Windows"
6466

67+
# name of the first language detection library that can be imported, in order of preference (None when none
#  of them is installed) ; the imported module is bound in the module's globals under its own name, so that
#  it can be referenced afterwards as e.g. `langid.classify(...)`
LANG_BACKEND = None
for lib in ["langid", "langdetect", "pycld2", "cld3", "textblob"]:
    try:
        globals()[lib] = import_module(lib)
    except ImportError:
        # library not installed ; try the next candidate
        continue
    LANG_BACKEND = lib
    break
75+
# language codes for which `lang_*` stop functions get registered when cld3 is the selected backend
CLD3_LANGUAGES = (
    "af|am|ar|bg|bn|bs|ca|ce|co|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|fy|ga|gd|gl|gu|ha|hi|hm|hr|ht|hu|"
    "hy|id|ig|is|it|iw|ja|jv|ka|kk|km|kn|ko|ku|ky|la|lb|lo|lt|lv|mg|mi|mk|ml|mn|mr|ms|mt|my|ne|nl|no|ny|"
    "pa|pl|ps|pt|ro|ru|sd|si|sk|sl|sm|sn|so|sq|sr|st|su|sv|sw|ta|te|tg|th|tr|uk|ur|uz|vi|xh|yi|yo|zh|zu"
).split("|")
# language codes for which `lang_*` stop functions get registered when textblob is the selected backend
TEXTBLOB_LANGUAGES = (
    "af|ar|az|be|bg|bn|ca|cs|cy|da|de|el|en|eo|es|et|eu|fa|fi|fr|ga|gl|gu|hi|hr|ht|hu|id|is|it|iw|"
    "ja|ka|kn|ko|la|lt|lv|mk|ms|mt|nl|no|pl|pt|ro|ru|sk|sl|sq|sr|sv|sw|ta|te|th|tl|tr|uk|ur|vi|yi|zh"
).split("|")
6582

6683
def entropy(s):
    """ Compute the Shannon entropy (base 2) of the given sequence from the relative frequency of its items.

    :param s: string (or any sequence supporting count() and len())
    :return:  entropy value ; 0 for an empty input or an input made of a single repeated item
    """
    n = len(s)
    # for an empty input, set(s) is empty, hence no division by n occurs and the sum is 0
    return -sum(p * log(p, 2) for p in (float(s.count(c)) / n for c in set(s)))
6784

@@ -199,25 +216,31 @@ def getregentry(encoding):
199216
fenc, fdec, name = encode, decode, encoding
200217
# prepare CodecInfo input arguments
201218
if pattern:
202-
m = re.match(pattern, encoding)
219+
m, args, i = re.match(pattern, encoding), [], 1
203220
try:
204-
g = m.group(1) or ""
205-
if g.isdigit():
206-
g = int(g)
207-
fenc = fenc(g) if fenc else fenc
208-
fdec = fdec(g) if fdec else fdec
209-
except AttributeError:
210-
# this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match
211-
if m is not None:
212-
raise
213-
return
221+
while True:
222+
try:
223+
g = m.group(i) or ""
224+
if g.isdigit():
225+
g = int(g)
226+
args += [g]
227+
i += 1
228+
except AttributeError:
229+
# this occurs when m is None or there is an error in fenc(g) or fdec(g), meaning no match
230+
if m is not None:
231+
raise
232+
return
214233
except IndexError:
215-
# this occurs while m is not None, but possibly no capture group that gives at least 1 group index ; in
216-
# this case, if fenc/fdec is a decorated function, execute it with no arg
217-
if fenc and len(getfullargspec(fenc).args) == 1:
218-
fenc = fenc()
219-
if fdec and len(getfullargspec(fdec).args) == 1:
220-
fdec = fdec()
234+
# this occurs while m is not None, but possibly no capture group that gives at least 1 group index ;
235+
# in this case, if fenc/fdec is a decorated function, execute it with no arg
236+
if len(args) == 0:
237+
if fenc and len(getfullargspec(fenc).args) == 1:
238+
fenc = fenc()
239+
if fdec and len(getfullargspec(fdec).args) == 1:
240+
fdec = fdec()
241+
else:
242+
fenc = fenc(*args) if fenc else fenc
243+
fdec = fdec(*args) if fdec else fdec
221244
if fenc:
222245
fenc = fix_inout_formats(fenc)
223246
if fdec:
@@ -1056,31 +1079,54 @@ def generate_strings_from_regex(regex, star_plus_max=STAR_PLUS_MAX, repeat_max=R
10561079
- `printables`: checks that every output character is in the set of printables
10571080
""")
10581081
stopfunc.printables = lambda s: all(c in printable for c in ensure_str(s))
1059-
stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None
1060-
stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6
1082+
stopfunc.printables.__name__ = stopfunc.printables.__qualname__ = "printables"
1083+
stopfunc.regex = lambda p: lambda s: re.search(p, ensure_str(s)) is not None
1084+
stopfunc.regex.__name__ = stopfunc.regex.__qualname__ = "regex"
1085+
stopfunc.text = lambda s: stopfunc.printables(s) and entropy(s) < 4.6
1086+
stopfunc.text.__name__ = stopfunc.text.__qualname__ = "text"
1087+
stopfunc.flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", ensure_str(x)) is not None
1088+
stopfunc.flag.__name__ = stopfunc.flag.__qualname__ = "flag"
1089+
stopfunc.default = stopfunc.printables
1090+
1091+
1092+
def _detect(text):
    """ Detect the language of the given text with the backend selected in LANG_BACKEND.

    :param text: input text (passed through ensure_str before detection)
    :return:     language code as given by the backend (truncated to 2 characters for cld3 and textblob)
    :raise ValueError: when LANG_BACKEND is None, that is, when no backend could be imported
    """
    backend, s = LANG_BACKEND, ensure_str(text)
    if backend is None:
        raise ValueError("No language backend installed")
    if backend == "langid":
        return langid.classify(s)[0]
    if backend == "langdetect":
        return langdetect.detect(s)
    if backend == "pycld2":
        return pycld2.detect(s)[2][0][1]
    if backend == "cld3":
        return cld3.get_language(s).language[:2]
    # any other value is handled with textblob
    return textblob.TextBlob(s).detect_language()[:2]
1101+
10611102

10621103
def _lang(lang):
    """ Build a stop function checking that the input is text written in the given language.

    :param lang: language code ; only its first 2 characters are compared with the detection result, so that
                  codes longer than 2 characters (e.g. "zhcn") can also match
    :return:     function taking an input and returning True if it passes stopfunc.text and its detected
                  language matches, False otherwise (including when the detection itself fails)
    """
    def _test(s):
        if not stopfunc.text(s):
            return False
        try:
            return _detect(ensure_str(s))[:2] == lang[:2]
        except Exception:
            # best effort: a failing detection (including no backend being installed) means "no match" ;
            #  KeyboardInterrupt and SystemExit are deliberately not caught here
            return False
    return _test
10711112

1072-
try:
1073-
from langdetect import detect, PROFILES_DIRECTORY
1074-
for lang in [p.replace("-", "") for p in os.listdir(PROFILES_DIRECTORY)]:
1075-
setattr(stopfunc, "lang_%s" % lang, _lang(lang))
1076-
except ImportError:
1077-
pass
1078-
1079-
1080-
__flag = lambda x: re.search(r"[Ff][Ll1][Aa4@][Gg96]", x) is not None
1081-
def _flag(x):
1082-
return __flag(ensure_str(x))
1083-
stopfunc.flag = _flag
1113+
def _backend_languages(backend):
    """ Get the language codes for which `lang_*` stop functions are registered with the given backend.

    :param backend: name of the language detection backend
    :return:        iterable of language codes ; empty list for an unknown backend
    """
    if backend == "langid":
        # load_model() is called before reading `identifier`, in the same order as it was done inline
        #  (presumably `identifier` is only populated once the model is loaded -- to be confirmed)
        langid.langid.load_model()
        return langid.langid.identifier.nb_classes
    if backend == "langdetect":
        return [p.replace("-", "") for p in os.listdir(langdetect.PROFILES_DIRECTORY)]
    if backend == "pycld2":
        return list(set(x[1][:2] for x in pycld2.LANGUAGES if x[0] in pycld2.DETECTED_LANGUAGES))
    if backend == "cld3":
        return CLD3_LANGUAGES
    if backend == "textblob":
        return TEXTBLOB_LANGUAGES
    return []


if LANG_BACKEND:
    _lb = LANG_BACKEND
    # register one stop function per language, named after its attribute name
    for lang in _backend_languages(_lb):
        n = "lang_%s" % lang
        setattr(stopfunc, n, _lang(lang))
        getattr(stopfunc, n).__name__ = n
        getattr(stopfunc, n).__qualname__ = n
    # when a stop function exists for the language of the current locale, it becomes the default one
    flng = "lang_%s" % LANG
    if getattr(stopfunc, flng, None):
        stopfunc.default = getattr(stopfunc, flng)
10841130

10851131

10861132
def __develop(encodings):
@@ -1140,7 +1186,7 @@ def expand(items, descr=None, transform=None):
11401186
if encoding in e:
11411187
continue
11421188
if debug:
1143-
print("[*] Depth %d/%d ; trying %s" % (depth+1, max_depth, encoding))
1189+
print("[*] Depth %0{}d/%d: %s".format(len(str(max_depth))) % (depth+1, max_depth, encoding))
11441190
__guess(input, new_input, stop_func, depth+1, max_depth, min_depth, codec_categories, exclude, result,
11451191
found + (encoding, ), stop, show, scoring_heuristic, extended, debug)
11461192

@@ -1236,7 +1282,7 @@ def __score(prev_input, input, codec, heuristic=False, extended=False):
12361282
yield s, new_input, encoding
12371283

12381284

1239-
def guess(input, stop_func=stopfunc.printables, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(),
1285+
def guess(input, stop_func=stopfunc.default, min_depth=0, max_depth=5, codec_categories=None, exclude=None, found=(),
12401286
stop=True, show=False, scoring_heuristic=False, extended=False, debug=False):
12411287
""" Try decoding without the knowledge of the encoding(s). """
12421288
if max_depth <= 0:

0 commit comments

Comments
 (0)