// core/src/main/java/org/apache/lucene/search/grouping/CollapsingTopDocsCollector.java

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lucene.search.grouping;

import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import static org.apache.lucene.search.SortField.Type.SCORE;

/**
 * Single-pass collector that collapses hits on a field value: for every distinct collapse key only the
 * best document according to the supplied {@link Sort} is retained. The result is exposed as a
 * {@link CollapseTopFieldDocs}; the key of each retained hit is available through
 * {@link CollapseTopFieldDocs#collapseValues}.
 */
public final class CollapsingTopDocsCollector<T> extends FirstPassGroupingCollector<T> {
    protected final String collapseField;

    protected final Sort sort;
    protected Scorer scorer;

    private int totalHitCount;
    private float maxScore;
    private final boolean trackMaxScore;

    CollapsingTopDocsCollector(GroupSelector<T> groupSelector, String collapseField, Sort sort,
                               int topN, boolean trackMaxScore) throws IOException {
        super(groupSelector, sort, topN);
        this.collapseField = collapseField;
        this.sort = sort;
        this.trackMaxScore = trackMaxScore;
        // NEGATIVE_INFINITY is the seed for the running maximum; NaN marks "max score not tracked".
        this.maxScore = trackMaxScore ? Float.NEGATIVE_INFINITY : Float.NaN;
    }

    /**
     * Converts the output of {@link FirstPassGroupingCollector#getTopGroups(int, boolean)} into a
     * {@link CollapseTopFieldDocs}. Because collapsing is a one-pass operation, the final top docs
     * can be built directly once the first pass has finished.
     *
     * @return the collapsed top docs; an empty result (no hits, no collapse values, {@code NaN} max score)
     *         when the first pass produced no groups
     */
    public CollapseTopFieldDocs getTopDocs() throws IOException {
        final Collection<SearchGroup<T>> topGroups = super.getTopGroups(0, true);
        final SortField[] sortFields = sort.getSort();
        if (topGroups == null) {
            return new CollapseTopFieldDocs(collapseField, totalHitCount, new ScoreDoc[0],
                sortFields, new Object[0], Float.NaN);
        }

        final int groupCount = topGroups.size();
        final FieldDoc[] hits = new FieldDoc[groupCount];
        final Object[] keys = new Object[groupCount];
        final int scoreIndex = scoreSortPosition(sortFields);

        // orderedGroups is walked in lock-step with the top groups to recover each group's top doc id.
        final Iterator<CollectedSearchGroup<T>> collected = orderedGroups.iterator();
        int slot = 0;
        for (SearchGroup<T> group : topGroups) {
            assert collected.hasNext();
            final CollectedSearchGroup<T> collectedGroup = collected.next();
            // The score is only known when the sort contains a SCORE field; otherwise report NaN.
            final float score = scoreIndex == -1 ? Float.NaN : (float) group.sortValues[scoreIndex];
            hits[slot] = new FieldDoc(collectedGroup.topDoc, score, group.sortValues);
            keys[slot] = group.groupValue;
            slot++;
        }
        return new CollapseTopFieldDocs(collapseField, totalHitCount, hits, sortFields, keys, maxScore);
    }

    /**
     * Returns the index of the first {@code SCORE} sort field in {@code fields}, or {@code -1} if there is none.
     */
    private static int scoreSortPosition(SortField[] fields) {
        for (int i = 0; i < fields.length; i++) {
            if (fields[i].getType() == SCORE) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Scores are required when the parent collector needs them or when the max score is tracked.
     */
    @Override
    public boolean needsScores() {
        return super.needsScores() || trackMaxScore;
    }

    /**
     * Forwards the scorer to the parent collector and keeps a reference for max score tracking.
     */
    @Override
    public void setScorer(Scorer scorer) throws IOException {
        super.setScorer(scorer);
        this.scorer = scorer;
    }

    /**
     * Collects the document in the parent collector, updates the running max score when tracking is
     * enabled, and counts the hit.
     */
    @Override
    public void collect(int doc) throws IOException {
        super.collect(doc);
        if (trackMaxScore) {
            maxScore = Math.max(maxScore, scorer.score());
        }
        totalHitCount += 1;
    }

    /**
     * Builds a collapsing top docs collector for a {@link org.apache.lucene.index.NumericDocValues} field.
     * A {@link org.apache.lucene.index.SortedNumericDocValues} field is accepted as well, but collection
     * fails with an {@link IllegalStateException} if a document holds more than one value for the field.
     *
     * @param collapseField The field whose values are used to group documents.
     * @param sort          The {@link Sort} applied to the collapsed hits; only the top sorted document
     *                      is kept for each collapse key. Must be non-null: to sort by relevance
     *                      use Sort.RELEVANCE.
     * @param topN          How many top groups to keep.
     * @param trackMaxScore Whether the maximum score over all collected documents should be recorded;
     *                      when {@code false} the reported max score is {@code NaN}.
     * @throws IOException When I/O related errors occur
     */
    public static CollapsingTopDocsCollector<?> createNumeric(String collapseField, Sort sort,
                                                              int topN, boolean trackMaxScore) throws IOException {
        return new CollapsingTopDocsCollector<>(
            new CollapsingDocValuesSource.Numeric(collapseField), collapseField, sort, topN, trackMaxScore);
    }

    /**
     * Builds a collapsing top docs collector for a {@link org.apache.lucene.index.SortedDocValues} field.
     * A {@link org.apache.lucene.index.SortedSetDocValues} field is accepted as well, but collection
     * fails with an {@link IllegalStateException} if a document holds more than one value for the field.
     *
     * @param collapseField The field whose values are used to group documents.
     * @param sort          The {@link Sort} applied to the collapsed hits; only the top sorted document
     *                      is kept for each collapse key. Must be non-null: to sort by relevance
     *                      use Sort.RELEVANCE.
     * @param topN          How many top groups to keep.
     * @param trackMaxScore Whether the maximum score over all collected documents should be recorded;
     *                      when {@code false} the reported max score is {@code NaN}.
     * @throws IOException When I/O related errors occur
     */
    public static CollapsingTopDocsCollector<?> createKeyword(String collapseField, Sort sort,
                                                              int topN, boolean trackMaxScore) throws IOException {
        return new CollapsingTopDocsCollector<>(
            new CollapsingDocValuesSource.Keyword(collapseField), collapseField, sort, topN, trackMaxScore);
    }
}