author     Damien George <damien.p.george@gmail.com>  2020-04-19 23:47:22 +1000
committer  Damien George <damien.p.george@gmail.com>  2020-04-20 10:32:49 +1000
commit     388d419ba39b061923d2568814195e8bf73330d4 (patch)
tree       bf381425e4a1d36fa77954a217e74af11c30520c /py/makecompresseddata.py
parent     1b1ceb67b25e0ea56c1e972514a48468fe478ad3 (diff)
py/makecompresseddata.py: Make compression deterministic.
Error string compression is not deterministic in certain cases: it depends on the Python version (whether dicts are ordered by default or not) and probably also on the order in which files are passed to this script, leading to differences in which words end up among the 128 most common. This commit uses an OrderedDict to keep parsed lines in a known order and, when computing how many bytes a given word saves, uses the word itself to break ties (which would otherwise be arbitrary).
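For context, a minimal sketch (with made-up word counts, not taken from the commit) of how the tie-break changes the sort: with only the byte saving as the key, two words that save the same number of bytes can come out in either order depending on dict/Counter iteration order; adding the word itself as a secondary key pins the result.

    import collections

    # Hypothetical word frequencies for illustration only.
    topn = collections.Counter({"index": 3, "range": 3, "argument": 5})

    def bytes_saved(item):
        w, n = item
        # Negative so sorted() puts the biggest saving first; the word `w`
        # breaks ties deterministically.
        return -((len(w) + 1) * (n - 1)), w

    print(sorted(topn.items(), key=bytes_saved))
    # "index" and "range" both save 12 bytes, but "index" sorts before
    # "range" lexically, so the order is stable across Python versions.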
Diffstat (limited to 'py/makecompresseddata.py')
-rw-r--r--  py/makecompresseddata.py  5
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/py/makecompresseddata.py b/py/makecompresseddata.py
index 28223a6d9..9603de871 100644
--- a/py/makecompresseddata.py
+++ b/py/makecompresseddata.py
@@ -51,9 +51,10 @@ def word_compression(error_strings):
             topn[word] += 1

     # Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
+    # Use the word itself for ties so that compression is deterministic.
     def bytes_saved(item):
         w, n = item
-        return -((len(w) + 1) * (n - 1))
+        return -((len(w) + 1) * (n - 1)), w

     top128 = sorted(topn.items(), key=bytes_saved)[:128]
@@ -143,7 +144,7 @@ def ngram_compression(error_strings):
 def main(collected_path, fn):
-    error_strings = {}
+    error_strings = collections.OrderedDict()
     max_uncompressed_len = 0
     num_uses = 0
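A hedged sketch (hypothetical strings, simplified bookkeeping) of why the second hunk switches to OrderedDict: on Python versions before 3.7 a plain dict does not guarantee iteration order, so the word counts, and therefore the chosen top 128 words, could differ from build to build. Keeping parsed error strings in insertion order removes that source of variation.

    import collections

    error_strings = collections.OrderedDict()
    for line in ["index out of range", "invalid argument", "index out of bounds"]:
        # The real script stores more per entry; None is a stand-in here.
        error_strings.setdefault(line, None)

    # Iteration order now matches parse order on every Python version,
    # so downstream frequency counting sees the same sequence each time.
    print(list(error_strings))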