core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/heuristics/JLHScore.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.elasticsearch.search.aggregations.bucket.significant.heuristics;


import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryShardException;

import java.io.IOException;

/**
 * The "jlh" significance heuristic.
 * <p>
 * The score of a term is the product of the absolute and the relative change
 * between its frequency in a subset (foreground) and in a superset (background):
 * <pre>
 *   (subsetProbability - supersetProbability) * (subsetProbability / supersetProbability)
 * </pre>
 * The heuristic has no settings, so nothing is written to or read from the
 * stream and it is rendered as an empty object named {@link #NAME}.
 */
public class JLHScore extends SignificanceHeuristic {
    /** Name under which this heuristic is serialized and rendered. */
    public static final String NAME = "jlh";

    /**
     * Creates the heuristic; there is no configuration to supply.
     */
    public JLHScore() {
    }

    /**
     * Read from a stream.
     *
     * @param in the stream; it is not consumed because this heuristic has no state
     */
    public JLHScore(StreamInput in) {
        // Nothing to read.
    }

    /**
     * Writes this heuristic to a stream. Intentionally empty, mirroring
     * {@link #JLHScore(StreamInput)} which reads nothing.
     */
    @Override
    public void writeTo(StreamOutput out) throws IOException {
        // Nothing to write.
    }

    /**
     * @return {@link #NAME}
     */
    @Override
    public String getWriteableName() {
        return NAME;
    }

    /**
     * Calculates the significance of a term in a sample against a background of
     * normal distributions by comparing the changes in frequency. This is the heart
     * of the significant terms feature.
     *
     * @param subsetFreq   number of occurrences of the term in the subset
     * @param subsetSize   size of the subset
     * @param supersetFreq number of occurrences of the term in the superset; a value
     *                     of 0 is treated as 1 for the purposes of the calculation
     * @param supersetSize size of the superset
     * @return 0 if either size is 0 or if the term is not more frequent in the subset
     *         than in the superset; otherwise the absolute probability change
     *         multiplied by the relative probability change
     */
    @Override
    public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
        // Inherited argument validation; its exact rules are defined outside this class.
        checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, "JLHScore");
        if ((subsetSize == 0) || (supersetSize == 0)) {
            // avoid any divide by zero issues
            return 0;
        }
        // If we are using a background context that is not a strict superset, a foreground
        // term may be missing from the background, so for the purposes of this calculation
        // we assume a value of 1 for our calculations which avoids returning an "infinity" result
        final long backgroundFreq = (supersetFreq == 0) ? 1 : supersetFreq;

        final double subsetProbability = (double) subsetFreq / (double) subsetSize;
        final double supersetProbability = (double) backgroundFreq / (double) supersetSize;

        // Using absoluteProbabilityChange alone favours very common words e.g. you, we etc
        // because a doubling in popularity of a common term is a big percent difference
        // whereas a rare term would have to achieve a hundred-fold increase in popularity to
        // achieve the same difference measure.
        // In favouring common words as suggested features for search we would get high
        // recall but low precision.
        final double absoluteProbabilityChange = subsetProbability - supersetProbability;
        if (absoluteProbabilityChange <= 0) {
            // The term is no more frequent in the subset than in the superset.
            return 0;
        }
        // Using relativeProbabilityChange tends to favour rarer terms e.g. mis-spellings or
        // unique URLs.
        // A very low-probability term can very easily double in popularity due to the low
        // numbers required to do so whereas a high-probability term would have to add many
        // extra individual sightings to achieve the same shift.
        // In favouring rare words as suggested features for search we would get high
        // precision but low recall.
        final double relativeProbabilityChange = subsetProbability / supersetProbability;

        // A blend of the above metrics - favours medium-rare terms to strike a useful
        // balance between precision and recall.
        return absoluteProbabilityChange * relativeProbabilityChange;
    }

    /**
     * Renders this heuristic as an empty object named {@link #NAME}.
     *
     * @return the builder that was passed in
     */
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(NAME).endObject();
        return builder;
    }

    /**
     * Parses the heuristic from its (empty) object representation.
     * <p>
     * The parser is advanced by exactly one token, which must be
     * {@link XContentParser.Token#END_OBJECT}.
     *
     * @param parser the parser to advance
     * @return a new {@link JLHScore}
     * @throws ElasticsearchParseException if the next token is anything other than
     *         {@code END_OBJECT}, including the case where no token is returned at all
     */
    public static SignificanceHeuristic parse(XContentParser parser)
            throws IOException, QueryShardException {
        // move to the closing bracket
        final XContentParser.Token token = parser.nextToken();
        // Identity comparison is null-safe: a missing token is reported as a parse
        // failure instead of surfacing as a NullPointerException.
        if (token != XContentParser.Token.END_OBJECT) {
            throw new ElasticsearchParseException(
                    "failed to parse [jlh] significance heuristic. expected an empty object, but found [{}] instead",
                    token);
        }
        return new JLHScore();
    }

    /**
     * Instances carry no state, so any two objects of exactly the same class are equal.
     */
    @Override
    public boolean equals(Object obj) {
        return obj != null && obj.getClass() == getClass();
    }

    /**
     * Consistent with {@link #equals(Object)}: depends only on the runtime class.
     */
    @Override
    public int hashCode() {
        return getClass().hashCode();
    }

    /**
     * Builder that renders the same empty object named {@link JLHScore#NAME} as
     * {@link JLHScore#toXContent(XContentBuilder, Params)}.
     */
    public static class JLHScoreBuilder implements SignificanceHeuristicBuilder {

        /**
         * Writes an empty object named {@link JLHScore#NAME}.
         *
         * @return the builder that was passed in
         */
        @Override
        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
            builder.startObject(NAME).endObject();
            return builder;
        }
    }
}