@@ -103,14 +103,10 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
103103 # go through each qstr and print it out
104104 for _ , _ , qstr in qstrs .values ():
105105 all_strings .append (qstr )
106- all_strings_concat = "" .join (all_strings ). encode ( "utf-8" )
106+ all_strings_concat = "" .join (all_strings )
107107 counts = collections .Counter (all_strings_concat )
108- # add other values
109- for i in range (256 ):
110- if i not in counts :
111- counts [i ] = 0
112108 cb = huffman .codebook (counts .items ())
113- values = bytearray ()
109+ values = []
114110 length_count = {}
115111 renumbered = 0
116112 last_l = None
@@ -124,26 +120,27 @@ def compute_huffman_coding(translations, qstrs, compression_filename):
124120 if last_l :
125121 renumbered <<= (l - last_l )
126122 canonical [ch ] = '{0:0{width}b}' .format (renumbered , width = l )
127- if chr (ch ) in C_ESCAPES :
128- s = C_ESCAPES [chr (ch )]
129- else :
130- s = chr (ch )
131- print ("//" , ch , s , counts [ch ], canonical [ch ], renumbered )
123+ s = C_ESCAPES .get (ch , ch )
124+ print ("//" , ord (ch ), s , counts [ch ], canonical [ch ], renumbered )
132125 renumbered += 1
133126 last_l = l
134127 lengths = bytearray ()
135- for i in range (1 , max (length_count ) + 1 ):
128+ print ("// length count" , length_count )
129+ for i in range (1 , max (length_count ) + 2 ):
136130 lengths .append (length_count .get (i , 0 ))
131+ print ("// values" , values , "lengths" , len (lengths ), lengths )
132+ print ("// estimated total memory size" , len (lengths ) + 2 * len (values ) + sum (len (cb [u ]) for u in all_strings_concat ))
137133 print ("//" , values , lengths )
134+ values_type = "uint16_t" if max (ord (u ) for u in values ) > 255 else "uint8_t"
138135 with open (compression_filename , "w" ) as f :
139136 f .write ("const uint8_t lengths[] = {{ {} }};\n " .format (", " .join (map (str , lengths ))))
140- f .write ("const uint8_t values[256 ] = {{ {} }};\n " .format (", " .join (map ( str , values ) )))
137+ f .write ("const {} values[] = {{ {} }};\n " .format (values_type , ", " .join (str ( ord ( u )) for u in values )))
141138 return values , lengths
142139
143140def decompress (encoding_table , length , encoded ):
144141 values , lengths = encoding_table
145142 #print(l, encoded)
146- dec = bytearray ( length )
143+ dec = []
147144 this_byte = 0
148145 this_bit = 7
149146 b = encoded [this_byte ]
@@ -173,14 +170,14 @@ def decompress(encoding_table, length, encoded):
173170 searched_length += lengths [bit_length ]
174171
175172 v = values [searched_length + bits - max_code ]
176- dec [ i ] = v
177- return dec
173+ dec . append ( v )
174+ return '' . join ( dec )
178175
179176def compress (encoding_table , decompressed ):
180- if not isinstance (decompressed , bytes ):
177+ if not isinstance (decompressed , str ):
181178 raise TypeError ()
182179 values , lengths = encoding_table
183- enc = bytearray (len (decompressed ) * 2 )
180+ enc = bytearray (len (decompressed ) * 3 )
184181 #print(decompressed)
185182 #print(lengths)
186183 current_bit = 7
@@ -228,7 +225,7 @@ def compress(encoding_table, decompressed):
228225 if current_bit != 7 :
229226 current_byte += 1
230227 if current_byte > len (decompressed ):
231- print ("Note: compression increased length" , repr (decompressed . decode ( 'utf-8' ) ), len (decompressed ), current_byte , file = sys .stderr )
228+ print ("Note: compression increased length" , repr (decompressed ), len (decompressed ), current_byte , file = sys .stderr )
232229 return enc [:current_byte ]
233230
234231def qstr_escape (qst ):
@@ -347,9 +344,9 @@ def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns):
347344 total_text_compressed_size = 0
348345 for original , translation in i18ns :
349346 translation_encoded = translation .encode ("utf-8" )
350- compressed = compress (encoding_table , translation_encoded )
347+ compressed = compress (encoding_table , translation )
351348 total_text_compressed_size += len (compressed )
352- decompressed = decompress (encoding_table , len (translation_encoded ), compressed ). decode ( "utf-8" )
349+ decompressed = decompress (encoding_table , len (translation_encoded ), compressed )
353350 for c in C_ESCAPES :
354351 decompressed = decompressed .replace (c , C_ESCAPES [c ])
355352 print ("TRANSLATION(\" {}\" , {}, {{ {} }}) // {}" .format (original , len (translation_encoded )+ 1 , ", " .join (["0x{:02x}" .format (x ) for x in compressed ]), decompressed ))
0 commit comments