12
12
import re
13
13
import sys
14
14
15
- from math import log
16
15
import collections
17
16
import gettext
18
17
import os .path
@@ -167,7 +166,7 @@ def compute_huffman_coding(translations, compression_filename):
167
166
sum_len = 0
168
167
while True :
169
168
# Until the dictionary is filled to capacity, use a heuristic to find
170
- # the best "word" (2- to 9-gram) to add to it.
169
+ # the best "word" (3- to 9-gram) to add to it.
171
170
#
172
171
# The TextSplitter allows us to avoid considering parts of the text
173
172
# that are already covered by a previously chosen word, for example
@@ -179,32 +178,25 @@ def compute_huffman_coding(translations, compression_filename):
179
178
for t in texts :
180
179
for (found , word ) in extractor .iter_words (t ):
181
180
if not found :
182
- for substr in iter_substrings (word , minlen = 2 , maxlen = 9 ):
181
+ for substr in iter_substrings (word , minlen = 3 , maxlen = 9 ):
183
182
counter [substr ] += 1
184
183
185
184
# Score the candidates we found. This is an empirical formula only,
186
185
# chosen for its effectiveness.
187
186
scores = sorted (
188
- ((s , (len (s ) - 1 ) ** log ( max ( occ - 2 , 1 )), occ ) for (s , occ ) in counter .items ()),
187
+ ((s , (len (s ) - 1 ) ** ( occ + 4 )) for (s , occ ) in counter .items () if occ > 4 ),
189
188
key = lambda x : x [1 ],
190
189
reverse = True ,
191
190
)
192
191
193
- # Do we have a "word" that occurred 5 times and got a score of at least
194
- # 5? Horray. Pick the one with the highest score.
195
- word = None
196
- for (s , score , occ ) in scores :
197
- if occ < 5 :
198
- continue
199
- if score < 5 :
200
- break
201
- word = s
192
+ # Pick the one with the highest score.
193
+ if not scores :
202
194
break
203
195
196
+ word = scores [0 ][0 ]
197
+
204
198
# If we can successfully add it to the dictionary, do so. Otherwise,
205
199
# we've filled the dictionary to capacity and are done.
206
- if not word :
207
- break
208
200
if sum_len + len (word ) - 2 > max_words_len :
209
201
break
210
202
if len (words ) == max_words :
0 commit comments