Merge pull request #4564 from tyomitch/patch-1 · jun2sak/circuitpython@e54e5e3 · GitHub
[go: up one dir, main page]

Skip to content

Commit e54e5e3

Browse files
authored
Merge pull request adafruit#4564 from tyomitch/patch-1
[build] simplify makeqstrdata heuristic
2 parents 83c768b + dcee89a commit e54e5e3

File tree

1 file changed

+7
-15
lines changed

1 file changed

+7
-15
lines changed

py/makeqstrdata.py

Lines changed: 7 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
import re
1313
import sys
1414

15-
from math import log
1615
import collections
1716
import gettext
1817
import os.path
@@ -167,7 +166,7 @@ def compute_huffman_coding(translations, compression_filename):
167166
sum_len = 0
168167
while True:
169168
# Until the dictionary is filled to capacity, use a heuristic to find
170-
# the best "word" (2- to 9-gram) to add to it.
169+
# the best "word" (3- to 9-gram) to add to it.
171170
#
172171
# The TextSplitter allows us to avoid considering parts of the text
173172
# that are already covered by a previously chosen word, for example
@@ -179,32 +178,25 @@ def compute_huffman_coding(translations, compression_filename):
179178
for t in texts:
180179
for (found, word) in extractor.iter_words(t):
181180
if not found:
182-
for substr in iter_substrings(word, minlen=2, maxlen=9):
181+
for substr in iter_substrings(word, minlen=3, maxlen=9):
183182
counter[substr] += 1
184183

185184
# Score the candidates we found. This is an empirical formula only,
186185
# chosen for its effectiveness.
187186
scores = sorted(
188-
((s, (len(s) - 1) ** log(max(occ - 2, 1)), occ) for (s, occ) in counter.items()),
187+
((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
189188
key=lambda x: x[1],
190189
reverse=True,
191190
)
192191

193-
# Do we have a "word" that occurred 5 times and got a score of at least
194-
# 5? Horray. Pick the one with the highest score.
195-
word = None
196-
for (s, score, occ) in scores:
197-
if occ < 5:
198-
continue
199-
if score < 5:
200-
break
201-
word = s
192+
# Pick the one with the highest score.
193+
if not scores:
202194
break
203195

196+
word = scores[0][0]
197+
204198
# If we can successfully add it to the dictionary, do so. Otherwise,
205199
# we've filled the dictionary to capacity and are done.
206-
if not word:
207-
break
208200
if sum_len + len(word) - 2 > max_words_len:
209201
break
210202
if len(words) == max_words:

0 commit comments

Comments
 (0)
0