11# -*- coding: UTF-8 -*-
22import _codecs
33import codecs
4+ import json
45import os
56import random
67import re
1112from inspect import currentframe
1213from itertools import chain , product
1314from math import log
15+ from random import randint
1416from six import binary_type , string_types , text_type , BytesIO
1517from string import *
1618from types import FunctionType , ModuleType
2830 maketrans = str .maketrans
2931
3032
31- __all__ = ["add" , "add_map" , "b" , "clear" , "codecs" , "decode" , "encode" , "ensure_str" , "examples" , "guess" , "isb" ,
32- "generate_strings_from_regex" , "get_alphabet_from_mask" , "handle_error" , "is_native" , "list_categories" ,
33- "list_encodings" , "lookup" , "maketrans" , "rank" , "re" , "register" , "remove" , "reset" , "s2i" , "search" ,
34- "stopfunc" , "BytesIO" , "MASKS" , "PY3" , "_input" , "_stripl" ]
33+ __all__ = ["add" , "add_macro" , "add_map" , "b" , "clear" , "codecs" , "decode" , "encode" , "ensure_str" , "examples" , "guess" ,
34+ "isb" , "generate_strings_from_regex" , "get_alphabet_from_mask" , "handle_error" , "is_native" ,
35+ "list_categories" , "list_encodings" , "list_macros" , "lookup" , "maketrans" , "os" , "rank" , "re" , "register" ,
36+ "remove" , "remove_macro" , "reset" , "s2i" , "search" , "stopfunc" , "BytesIO" , "MASKS" , "PY3" , "_input" ,
37+ "_stripl" , "CodecMacro" ]
3538CODECS_REGISTRY = None
3639CODECS_CATEGORIES = ["native" , "custom" ]
3740MASKS = {
4851PY3 = sys .version [0 ] == "3"
4952__codecs_registry = []
5053
54+ MACROS = {}
55+ PERS_MACROS = {}
56+ PERS_MACROS_FILE = os .path .expanduser ("~/.codext-macros.json" )
57+
5158
5259entropy = lambda s : - sum ([p * log (p , 2 ) for p in [float (s .count (c )) / len (s ) for c in set (s )]])
5360
5865s2i = lambda s : int (codecs .encode (s , "base16" ), 16 )
5966
6067
class CodecMacro(tuple):
    """ Macro details when looking up the codec registry.

    A macro is a named chain of codecs: encoding applies every codec of the chain in order, decoding applies them in
     reverse order. The definition is searched in the personal macros (PERS_MACROS) first, then in the embedded ones
     (MACROS).

    :param name: macro name
    :raises LookupError: if the macro is unknown, empty, or if one of its codecs cannot be found
    :raises ValueError:  if the macro definition is not a list or a tuple
    """
    def __new__(cls, name):
        self = tuple.__new__(cls)
        self.name = name
        # get from personal macros first, then fall back on the embedded ones
        for registry in (PERS_MACROS, MACROS):
            if name in registry:
                self.codecs = registry[name]
                break
        else:
            raise LookupError("unknown macro: %s" % name)
        if not isinstance(self.codecs, (tuple, list)):
            raise ValueError("bad macro list: %s" % str(self.codecs))
        # an empty chain cannot be validated nor used ; keep signalling it with a LookupError (the original code
        #  failed with an IndexError, which is a LookupError too) but with an explicit message
        if len(self.codecs) == 0:
            raise LookupError("empty macro: %s" % name)
        # lookup(e, False) disables macro resolution, meaning that macros won't be nestable
        self.codecs = [lookup(e, False) for e in self.codecs]
        self.parameters = {'name': name, 'category': "macro"}
        # test the examples of the first codec to check that the chain of encodings works ; any failure propagates
        first_examples = self.codecs[0].parameters.get('examples', {}) or {}
        for action, samples in first_examples.items():
            if not re.match(r"enc(-dec)?\(", action):
                continue
            for e in (samples.keys() if action.startswith("enc(") else samples or []):
                rd = re.match(r"\@random(?:\{(\d+(?:,(\d+))*?)\})?$", e)
                if rd:
                    # "@random{n1,n2,...}" stands for random inputs of the given lengths (512 when unspecified)
                    for n in (rd.group(1) or "512").split(","):
                        self.encode("".join(chr(randint(0, 255)) for _ in range(int(n))))
                    continue
                self.encode(e)

        # inside the nested classes below, 'self' designates the incremental/stream object ; the macro is therefore
        #  captured under another name so that they delegate to the macro instead of calling themselves forever
        macro = self

        class Codec:
            decode = macro.decode
            encode = macro.encode

        class IncrementalEncoder(codecs.IncrementalEncoder):
            def encode(self, input, final=False):
                return b(macro.encode(input, self.errors)[0])
        self.incrementalencoder = IncrementalEncoder

        class IncrementalDecoder(codecs.IncrementalDecoder):
            def decode(self, input, final=False):
                return ensure_str(macro.decode(input, self.errors)[0])
        self.incrementaldecoder = IncrementalDecoder

        class StreamWriter(Codec, codecs.StreamWriter):
            charbuffertype = bytes
        self.streamwriter = StreamWriter

        class StreamReader(Codec, codecs.StreamReader):
            charbuffertype = bytes
        self.streamreader = StreamReader

        return self

    def decode(self, input, error="strict"):
        """ Decode with each codec in reverse order.

        :param input: data to be decoded
        :param error: error handling scheme passed to each codec
        :return:      (decoded data, length reported by the last applied codec)
        """
        if not self.codecs:
            # nothing to apply ; behave as the identity instead of failing on an unbound length
            return input, len(input)
        for ci in self.codecs[::-1]:
            input, length = ci.decode(input, error)
        # NOTE(review): the length is the one reported by the last applied codec, not the length consumed from the
        #                original input - confirm that stream readers/writers can live with that
        return input, length

    def encode(self, input, error="strict"):
        """ Encode with each codec.

        :param input: data to be encoded
        :param error: error handling scheme passed to each codec
        :return:      (encoded data, length reported by the last applied codec)
        """
        if not self.codecs:
            # nothing to apply ; behave as the identity instead of failing on an unbound length
            return input, len(input)
        for ci in self.codecs:
            input, length = ci.encode(input, error)
        return input, length

    def __repr__(self):
        return "<codext.CodecMacro object for encoding %s at %#x>" % (self.name, id(self))
135+
61136def __stdin_pipe ():
62137 """ Stdin pipe read function. """
63138 try :
@@ -215,6 +290,28 @@ class StreamReader(Codec, codecs.StreamReader):
215290 register (getregentry , add_to_codecs )
216291
217292
def add_macro(mname, *encodings):
    """ This allows to define a macro, chaining multiple codecs one after the other. The new macro is stored in the
         personal macros (which take precedence over the embedded ones when a macro is looked up) and the whole set of
         personal macros is saved as JSON to PERS_MACROS_FILE.

    :param mname:     macro name
    :param encodings: encoding names of the encodings to be chained with the macro
    :raises ValueError: if no encoding is given or if the name clashes with an existing macro or codec
    """
    # a macro without any encoding could never be looked up afterwards ; refuse it before persisting anything
    if len(encodings) == 0:
        raise ValueError("No encoding specified for macro '%s'" % mname)
    # check for name clash with already existing macros and codecs
    if mname in MACROS or mname in PERS_MACROS:
        raise ValueError("Macro name already exists")
    try:
        ci = lookup(mname, False)  # False: only search for real codecs, not for macros
    except LookupError:
        ci = None
    if ci is not None:
        raise ValueError("Macro name clashes with codec '%s'" % ci.name)
    #TODO: test if the encodings sequence can work, using an example from the first codec
    PERS_MACROS[mname] = encodings
    with open(PERS_MACROS_FILE, 'w') as f:
        json.dump(PERS_MACROS, f)
codecs.add_macro = add_macro
313+
314+
218315def add_map (ename , encmap , repl_char = "?" , sep = "" , ignore_case = None , no_error = False , intype = None , outype = None , ** kwargs ):
219316 """ This adds a new mapping codec (that is, declarable with a simple character mapping dictionary) to the codecs
220317 module dynamically setting its encode and/or decode functions, eventually dynamically naming the encoding with
@@ -474,7 +571,7 @@ def examples(encoding, number=10):
474571 while i < min (number , len (temp )):
475572 if not temp [i ].isdigit ():
476573 try :
477- lookup (temp [i ])
574+ lookup (temp [i ], False )
478575 e .append (temp [i ])
479576 except LookupError :
480577 pass
@@ -492,7 +589,7 @@ def examples(encoding, number=10):
492589
def is_native(encoding):
    """ Determine if a given encoding is native or not.

    :param encoding: encoding name
    :return:         True if the category of the matching codec is "native"
    """
    # lookup(..., False) disables macro resolution, so that only real codecs are considered
    return lookup(encoding, False).parameters['category'] == "native"
496593
497594
498595def list_categories ():
@@ -546,6 +643,11 @@ def list_encodings(*categories):
546643 return sorted (list (set (enc )), key = _human_keys )
547644
548645
def list_macros():
    """ List the names of all the known macros (embedded and personal ones), sorted and without duplicates. """
    names = set(MACROS)
    names.update(PERS_MACROS)
    return sorted(names)
649+
650+
549651def remove (encoding ):
550652 """ Remove all search functions matching the input encoding name from codext's local registry. """
551653 tbr = []
@@ -557,9 +659,23 @@ def remove(encoding):
557659codecs .remove = remove
558660
559661
def remove_macro(name):
    """ Drop the macro with the given name from both the embedded and the personal macro registries.

    :param name: macro name ; unknown names are silently ignored
    """
    # embedded macros only live in memory
    MACROS.pop(name, None)
    # personal macros are persisted ; rewrite the file only when something was actually removed
    if name in PERS_MACROS:
        del PERS_MACROS[name]
        with open(PERS_MACROS_FILE, 'w') as f:
            json.dump(PERS_MACROS, f)
674+
675+
560676def reset ():
561- """ Reset codext's local registry of search functions. """
562- global CODECS_REGISTRY , __codecs_registry
677+ """ Reset codext's local registry of search functions and macros . """
678+ global CODECS_REGISTRY , MACROS , PERS_MACROS , __codecs_registry
563679 clear ()
564680 d = os .path .dirname (__file__ )
565681 for pkg in sorted (os .listdir (d )):
@@ -572,6 +688,14 @@ def reset():
572688 # restore codext's registry
573689 else :
574690 __codecs_registry = CODECS_REGISTRY [:]
691+ # restore codext's embedded set of macros
692+ with open (os .path .join (os .path .dirname (__file__ ), "macros.json" )) as f :
693+ MACROS = json .load (f )
694+ # reload personal set of macros
695+ PERS_MACROS = {}
696+ if os .path .exists (PERS_MACROS_FILE ):
697+ with open (PERS_MACROS_FILE ) as f :
698+ PERS_MACROS = json .load (f )
575699codecs .reset = reset
576700
577701
@@ -709,7 +833,7 @@ def encode(obj, encoding='utf-8', errors='strict'):
709833codecs .encode = encode
710834
711835
712- def lookup (encoding ):
836+ def lookup (encoding , macro = True ):
713837 """ Hooked lookup function for searching first for codecs in the local registry of this module. """
714838 # first, try to match the given encoding with codecs' search functions
715839 for search_function in __codecs_registry :
@@ -723,10 +847,18 @@ def lookup(encoding):
723847 codecinfo = search_function (generate_string_from_regex (search_function .__pattern__ ))
724848 if codecinfo is not None :
725849 return codecinfo
726- # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters
727- ci = __orig_lookup (encoding )
728- ci .parameters = {'category' : "native" , 'module' : "codecs" , 'name' : aliases .get (ci .name , ci .name )}
729- return ci
850+ try :
851+ # finally, get a CodecInfo with the original lookup function and refine it with a dictionary of parameters
852+ ci = __orig_lookup (encoding )
853+ ci .parameters = {'category' : "native" , 'module' : "codecs" , 'name' : aliases .get (ci .name , ci .name )}
854+ return ci
855+ except LookupError :
856+ if not macro :
857+ raise
858+ try :
859+ return CodecMacro (encoding )
860+ except LookupError :
861+ raise LookupError ("unknown encoding: %s" % encoding )
730862codecs .lookup = lookup
731863
732864
@@ -945,7 +1077,7 @@ def __develop(encodings):
9451077 enc = []
9461078 for e in (encodings or []):
9471079 try :
948- ci = lookup (e )
1080+ ci = lookup (e , False )
9491081 g = ci .parameters ['guess' ]
9501082 except :
9511083 g = [e ]
@@ -1023,7 +1155,7 @@ def __init__(self, text, pad_char=None):
10231155
10241156def __score (prev_input , input , codec , heuristic = False , extended = False ):
10251157 """ Score relevant encodings given an input. """
1026- obj , ci = None , lookup (codec ) # NB: lookup(...) won't fail as the codec value comes from list_encodings(...)
1158+ obj , ci = None , lookup (codec , False ) # NB: lookup(...) won't fail as the codec value comes from list_encodings(...)
10271159 sc = ci .parameters .get ('scoring' , {})
10281160 for encoding in ci .parameters .get ('guess' , [codec ]):
10291161 # ignore encodings that fail to decode with their default errors handling value
0 commit comments