1212import re
1313import sys
1414
15+ from math import log
1516import collections
1617import gettext
1718import os .path
@@ -111,9 +112,10 @@ def __init__(self, words):
111112
112113 def iter_words (self , text ):
113114 s = []
115+ words = self .words
114116 for m in self .pat .finditer (text ):
115117 t = m .group (0 )
116- if t in self . words :
118+ if t in words :
117119 if s :
118120 yield (False , "" .join (s ))
119121 s = []
@@ -124,33 +126,35 @@ def iter_words(self, text):
124126 yield (False , "" .join (s ))
125127
126128 def iter (self , text ):
127- s = []
128129 for m in self .pat .finditer (text ):
129130 yield m .group (0 )
130131
131132def iter_substrings (s , minlen , maxlen ):
132- maxlen = min (len (s ), maxlen )
133+ len_s = len (s )
134+ maxlen = min (len_s , maxlen )
133135 for n in range (minlen , maxlen + 1 ):
134- for begin in range (0 , len ( s ) - n + 1 ):
136+ for begin in range (0 , len_s - n + 1 ):
135137 yield s [begin : begin + n ]
136138
137139def compute_huffman_coding (translations , compression_filename ):
138140 texts = [t [1 ] for t in translations ]
139- all_strings_concat = "" .join (texts )
140141 words = []
142+
143+ start_unused = 0x80
144+ end_unused = 0xff
141145 max_ord = 0
142- begin_unused = 128
143- end_unused = 256
144146 for text in texts :
145147 for c in text :
146148 ord_c = ord (c )
147- max_ord = max (max_ord , ord_c )
148- if 128 <= ord_c < 256 :
149+ max_ord = max (ord_c , max_ord )
150+ if 0x80 <= ord_c < 0xff :
149151 end_unused = min (ord_c , end_unused )
150- max_words = end_unused - begin_unused
151- char_size = 1 if max_ord < 256 else 2
152+ max_words = end_unused - 0x80
153+
154+ values_type = "uint16_t" if max_ord > 255 else "uint8_t"
155+ max_words_len = 160 if max_ord > 255 else 255
152156
153- sum_word_len = 0
157+ sum_len = 0
154158 while True :
155159 extractor = TextSplitter (words )
156160 counter = collections .Counter ()
@@ -162,30 +166,30 @@ def compute_huffman_coding(translations, compression_filename):
162166
163167 scores = sorted (
164168 (
165- # I don't know why this works good. This could be better.
166- (s , (len (s ) - 1 ) ** ((max (occ - 2 , 1 ) + 0.5 ) ** 0.8 ), occ )
169+ (s , (len (s ) - 1 ) ** log (max (occ - 2 , 1 )), occ )
167170 for (s , occ ) in counter .items ()
168171 ),
169172 key = lambda x : x [1 ],
170173 reverse = True ,
171174 )
172175
173- w = None
176+ word = None
174177 for (s , score , occ ) in scores :
175- if score < 0 :
176- break
177- if len (s ) > 1 :
178- w = s
178+ if occ < 5 :
179+ continue
180+ if score < 5 :
179181 break
182+ word = s
183+ break
180184
181- if not w :
185+ if not word :
182186 break
183- if len (w ) + sum_word_len > 256 :
187+ if sum_len + len (word ) - 2 > max_words_len :
184188 break
185189 if len (words ) == max_words :
186190 break
187- words .append (w )
188- sum_word_len += len (w )
191+ words .append (word )
192+ sum_len += len (word ) - 2
189193
190194 extractor = TextSplitter (words )
191195 counter = collections .Counter ()
@@ -194,26 +198,26 @@ def compute_huffman_coding(translations, compression_filename):
194198 counter [atom ] += 1
195199 cb = huffman .codebook (counter .items ())
196200
197- word_start = begin_unused
201+ word_start = start_unused
198202 word_end = word_start + len (words ) - 1
199203 print ("// # words" , len (words ))
200204 print ("// words" , words )
201205
202206 values = []
203207 length_count = {}
204208 renumbered = 0
205- last_l = None
209+ last_length = None
206210 canonical = {}
207211 for atom , code in sorted (cb .items (), key = lambda x : (len (x [1 ]), x [0 ])):
208212 values .append (atom )
209- l = len (code )
210- if l not in length_count :
211- length_count [l ] = 0
212- length_count [l ] += 1
213- if last_l :
214- renumbered <<= (l - last_l )
215- canonical [atom ] = '{0:0{width}b}' .format (renumbered , width = l )
216- #print(f"atom={repr(atom)} code={code}", file=sys.stderr)
213+ length = len (code )
214+ if length not in length_count :
215+ length_count [length ] = 0
216+ length_count [length ] += 1
217+ if last_length :
218+ renumbered <<= (length - last_length )
219+ canonical [atom ] = '{0:0{width}b}' .format (renumbered , width = length )
220+ # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
217221 if len (atom ) > 1 :
218222 o = words .index (atom ) + 0x80
219223 s = "" .join (C_ESCAPES .get (ch1 , ch1 ) for ch1 in atom )
@@ -222,34 +226,37 @@ def compute_huffman_coding(translations, compression_filename):
222226 o = ord (atom )
223227 print ("//" , o , s , counter [atom ], canonical [atom ], renumbered )
224228 renumbered += 1
225- last_l = l
229+ last_length = length
226230 lengths = bytearray ()
227231 print ("// length count" , length_count )
228232
229233 for i in range (1 , max (length_count ) + 2 ):
230234 lengths .append (length_count .get (i , 0 ))
231235 print ("// values" , values , "lengths" , len (lengths ), lengths )
232- maxord = max (ord (u ) for u in values if len (u ) == 1 )
233- values_type = "uint16_t" if maxord > 255 else "uint8_t"
234- ch_size = 1 if maxord > 255 else 2
236+
235237 print ("//" , values , lengths )
236238 values = [(atom if len (atom ) == 1 else chr (0x80 + words .index (atom ))) for atom in values ]
237239 print ("//" , values , lengths )
238- max_translation_encoded_length = max (len (translation .encode ("utf-8" )) for original ,translation in translations )
240+ max_translation_encoded_length = max (
241+ len (translation .encode ("utf-8" )) for (original , translation ) in translations )
242+
243+ wends = list (len (w ) - 2 for w in words )
244+ for i in range (1 , len (wends )):
245+ wends [i ] += wends [i - 1 ]
246+
239247 with open (compression_filename , "w" ) as f :
240248 f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
241249 f .write ("const {} values[] = {{ {} }};\n " .format (values_type , ", " .join (str (ord (u )) for u in values )))
242250 f .write ("#define compress_max_length_bits ({})\n " .format (max_translation_encoded_length .bit_length ()))
243251 f .write ("const {} words[] = {{ {} }};\n " .format (values_type , ", " .join (str (ord (c )) for w in words for c in w )))
244- f .write ("const uint8_t wlen [] = {{ {} }};\n " .format (", " .join (str (len ( w )) for w in words )))
252+ f .write ("const uint8_t wends [] = {{ {} }};\n " .format (", " .join (str (p ) for p in wends )))
245253 f .write ("#define word_start {}\n " .format (word_start ))
246254 f .write ("#define word_end {}\n " .format (word_end ))
247255
248- extractor = TextSplitter (words )
249- return values , lengths , words , extractor
256+ return (values , lengths , words , canonical , extractor )
250257
251258def decompress (encoding_table , encoded , encoded_length_bits ):
252- values , lengths , words , extractor = encoding_table
259+ ( values , lengths , words , _ , _ ) = encoding_table
253260 dec = []
254261 this_byte = 0
255262 this_bit = 7
@@ -306,66 +313,32 @@ def decompress(encoding_table, encoded, encoded_length_bits):
306313def compress (encoding_table , decompressed , encoded_length_bits , len_translation_encoded ):
307314 if not isinstance (decompressed , str ):
308315 raise TypeError ()
309- values , lengths , words , extractor = encoding_table
316+ ( _ , _ , _ , canonical , extractor ) = encoding_table
310317
311318 enc = bytearray (len (decompressed ) * 3 )
312- #print(decompressed)
313- #print(lengths)
314319 current_bit = 7
315320 current_byte = 0
316321
317- code = len_translation_encoded
318- bits = encoded_length_bits + 1
322+ bits = encoded_length_bits + 1
319323 for i in range (bits - 1 , 0 , - 1 ):
320324 if len_translation_encoded & (1 << (i - 1 )):
321325 enc [current_byte ] |= 1 << current_bit
322326 if current_bit == 0 :
323327 current_bit = 7
324- #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
325328 current_byte += 1
326329 else :
327330 current_bit -= 1
328331
329- #print("values = ", values, file=sys.stderr)
330332 for atom in extractor .iter (decompressed ):
331- #print("", file=sys.stderr)
332- if len (atom ) > 1 :
333- c = chr (0x80 + words .index (atom ))
334- else :
335- c = atom
336- assert c in values
337-
338- start = 0
339- end = lengths [0 ]
340- bits = 1
341- compressed = None
342- code = 0
343- while compressed is None :
344- s = start
345- e = end
346- #print("{0:0{width}b}".format(code, width=bits))
347- # Linear search!
348- for i in range (s , e ):
349- if values [i ] == c :
350- compressed = code + (i - start )
351- #print("found {0:0{width}b}".format(compressed, width=bits), file=sys.stderr)
352- break
353- code += end - start
354- code <<= 1
355- start = end
356- end += lengths [bits ]
357- bits += 1
358- #print("next bit", bits)
359-
360- for i in range (bits - 1 , 0 , - 1 ):
361- if compressed & (1 << (i - 1 )):
333+ for b in canonical [atom ]:
334+ if b == "1" :
362335 enc [current_byte ] |= 1 << current_bit
363336 if current_bit == 0 :
364337 current_bit = 7
365- #print("packed {0:0{width}b}".format(enc[current_byte], width=8))
366338 current_byte += 1
367339 else :
368340 current_bit -= 1
341+
369342 if current_bit != 7 :
370343 current_byte += 1
371344 return enc [:current_byte ]
0 commit comments