/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.metrics.cardinality;
import com.carrotsearch.hppc.BitMixer;
import com.carrotsearch.hppc.IntHashSet;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.test.ESTestCase;
import static org.elasticsearch.search.aggregations.metrics.cardinality.HyperLogLogPlusPlus.MAX_PRECISION;
import static org.elasticsearch.search.aggregations.metrics.cardinality.HyperLogLogPlusPlus.MIN_PRECISION;
import static org.hamcrest.Matchers.closeTo;
public class HyperLogLogPlusPlusTests extends ESTestCase {

    /**
     * Round-trips random and boundary hashes through {@code encodeHash}/{@code decodeIndex}/{@code decodeRunLen}
     * and checks that the register index and run length survive the encoding.
     */
    public void testEncodeDecode() {
        final int iters = scaledRandomIntBetween(100000, 500000);
        // random hashes
        for (int i = 0; i < iters; ++i) {
            // NOTE(review): the upper bound 24 exceeds MAX_PRECISION (18) — presumably deliberate,
            // to exercise the encoding beyond the supported precision range; confirm before changing.
            final int p1 = randomIntBetween(4, 24);
            final long hash = randomLong();
            testEncodeDecode(p1, hash);
        }
        // special cases: all bits clear, only the lowest bit set, all bits set
        for (int p1 = MIN_PRECISION; p1 <= MAX_PRECISION; ++p1) {
            testEncodeDecode(p1, 0);
            testEncodeDecode(p1, 1);
            testEncodeDecode(p1, ~0L);
        }
    }

    /** Asserts that encoding {@code hash} at precision {@code p1} preserves both index and run length. */
    private void testEncodeDecode(int p1, long hash) {
        final long index = HyperLogLogPlusPlus.index(hash, p1);
        final int runLen = HyperLogLogPlusPlus.runLen(hash, p1);
        final int encoded = HyperLogLogPlusPlus.encodeHash(hash, p1);
        assertEquals(index, HyperLogLogPlusPlus.decodeIndex(encoded, p1));
        assertEquals(runLen, HyperLogLogPlusPlus.decodeRunLen(encoded, p1));
    }

    /**
     * Collects random values into a single bucket and checks that the estimated cardinality
     * stays within 10% of the exact distinct count (tracked in an {@link IntHashSet}).
     */
    public void testAccuracy() {
        final long bucket = randomInt(20);
        final int numValues = randomIntBetween(1, 100000);
        final int maxValue = randomIntBetween(1, randomBoolean() ? 1000 : 100000);
        final int p = randomIntBetween(14, MAX_PRECISION);
        IntHashSet set = new IntHashSet();
        HyperLogLogPlusPlus e = new HyperLogLogPlusPlus(p, BigArrays.NON_RECYCLING_INSTANCE, 1);
        for (int i = 0; i < numValues; ++i) {
            final int n = randomInt(maxValue);
            set.add(n);
            final long hash = BitMixer.mix64(n);
            e.collect(bucket, hash);
            // occasionally check the estimate mid-stream, not only at the end
            if (randomInt(100) == 0) {
                assertThat((double) e.cardinality(bucket), closeTo(set.size(), 0.1 * set.size()));
            }
        }
        assertThat((double) e.cardinality(bucket), closeTo(set.size(), 0.1 * set.size()));
    }

    /**
     * Collects the same hashes into one "single" sketch and into several smaller sketches,
     * then checks that merging the small sketches yields exactly the same cardinality estimate.
     */
    public void testMerge() {
        final int p = randomIntBetween(MIN_PRECISION, MAX_PRECISION);
        final HyperLogLogPlusPlus single = new HyperLogLogPlusPlus(p, BigArrays.NON_RECYCLING_INSTANCE, 0);
        final HyperLogLogPlusPlus[] multi = new HyperLogLogPlusPlus[randomIntBetween(2, 100)];
        final long[] bucketOrds = new long[multi.length];
        for (int i = 0; i < multi.length; ++i) {
            bucketOrds[i] = randomInt(20);
            multi[i] = new HyperLogLogPlusPlus(p, BigArrays.NON_RECYCLING_INSTANCE, 5);
        }
        final int numValues = randomIntBetween(1, 100000);
        final int maxValue = randomIntBetween(1, randomBoolean() ? 1000 : 1000000);
        for (int i = 0; i < numValues; ++i) {
            final int n = randomInt(maxValue);
            final long hash = BitMixer.mix64(n);
            single.collect(0, hash);
            // Skew the choice towards low indices so the instances collect different numbers of
            // hashes (quadratic skew, not a true gaussian). BUG FIX: the original
            // `(int) (Math.pow(randomDouble(), 2))` is always 0 because Math.pow(randomDouble(), 2)
            // lies in [0, 1), so only multi[0] ever received hashes; scale by multi.length so every
            // instance can be selected and the merge is actually exercised.
            final int index = (int) (Math.pow(randomDouble(), 2) * multi.length);
            multi[index].collect(bucketOrds[index], hash);
            // occasionally merge everything and compare against the single sketch
            if (randomInt(100) == 0) {
                HyperLogLogPlusPlus merged = new HyperLogLogPlusPlus(p, BigArrays.NON_RECYCLING_INSTANCE, 0);
                for (int j = 0; j < multi.length; ++j) {
                    merged.merge(0, multi[j], bucketOrds[j]);
                }
                assertEquals(single.cardinality(0), merged.cardinality(0));
            }
        }
    }

    /**
     * Hashes with lots of leading zeros (0, 1) trigger different paths in the code; check they
     * are counted correctly both before and after upgrading from linear counting to HLL.
     */
    public void testFakeHashes() {
        final int p = randomIntBetween(MIN_PRECISION, MAX_PRECISION);
        final HyperLogLogPlusPlus counts = new HyperLogLogPlusPlus(p, BigArrays.NON_RECYCLING_INSTANCE, 0);
        counts.collect(0, 0);
        assertEquals(1, counts.cardinality(0));
        if (randomBoolean()) {
            counts.collect(0, 1);
            assertEquals(2, counts.cardinality(0));
        }
        counts.upgradeToHll(0);
        // all hashes fell into the same bucket so hll would expect a count of 1
        assertEquals(1, counts.cardinality(0));
    }

    /** Pins the precision chosen for a handful of representative count thresholds. */
    public void testPrecisionFromThreshold() {
        assertEquals(4, HyperLogLogPlusPlus.precisionFromThreshold(0));
        assertEquals(6, HyperLogLogPlusPlus.precisionFromThreshold(10));
        assertEquals(10, HyperLogLogPlusPlus.precisionFromThreshold(100));
        assertEquals(13, HyperLogLogPlusPlus.precisionFromThreshold(1000));
        assertEquals(16, HyperLogLogPlusPlus.precisionFromThreshold(10000));
        assertEquals(18, HyperLogLogPlusPlus.precisionFromThreshold(100000));
        assertEquals(18, HyperLogLogPlusPlus.precisionFromThreshold(1000000));
    }
}