core/src/main/java/org/apache/lucene/search/grouping/CollapsingTopDocsCollector.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lucene.search.grouping;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;

import static org.apache.lucene.search.SortField.Type.SCORE;

/**
 * Single-pass collector that collapses hits on the value of a field and exposes the result as
 * {@link CollapseTopFieldDocs}. For every distinct collapse key only the best document according
 * to the supplied {@link Sort} is retained. The key of each retained hit is reported in
 * {@link CollapseTopFieldDocs#collapseValues}.
 *
 * @param <T> type of the collapse key
 */
public abstract class CollapsingTopDocsCollector<T> extends FirstPassGroupingCollector<T> {
    /** Name of the field whose value is used as collapse key. */
    protected final String collapseField;

    /** Sort applied to the collapsed hits. */
    protected final Sort sort;
    /** Scorer most recently handed to {@link #setScorer(Scorer)}. */
    protected Scorer scorer;

    /** Number of documents passed to {@link #collect(int)}. */
    private int totalHitCount;
    /** Highest score seen so far when tracking is enabled, {@link Float#NaN} otherwise. */
    private float maxScore;
    private final boolean trackMaxScore;

    private CollapsingTopDocsCollector(String collapseField, Sort sort,
                                       int topN, boolean trackMaxScore) throws IOException {
        super(sort, topN);
        this.collapseField = collapseField;
        this.sort = sort;
        this.trackMaxScore = trackMaxScore;
        // Start below every possible score when tracking; NaN signals "not tracked".
        this.maxScore = trackMaxScore ? Float.NEGATIVE_INFINITY : Float.NaN;
    }

    /**
     * Converts the output of {@link FirstPassGroupingCollector#getTopGroups(int, boolean)} into
     * {@link CollapseTopFieldDocs}. Collapsing requires a single pass only, so the final top docs
     * can be built as soon as the first pass is over.
     *
     * @return the collapsed top docs; an instance without any hit (and a {@link Float#NaN} max score)
     *         when the parent collector reports no group
     */
    public CollapseTopFieldDocs getTopDocs() {
        final SortField[] sortFields = sort.getSort();
        final Collection<SearchGroup<T>> topGroups = super.getTopGroups(0, true);
        if (topGroups == null) {
            return new CollapseTopFieldDocs(collapseField, totalHitCount, new ScoreDoc[0],
                sortFields, new Object[0], Float.NaN);
        }

        final int size = topGroups.size();
        final FieldDoc[] hits = new FieldDoc[size];
        final Object[] keys = new Object[size];
        final int scoreSlot = findScorePosition(sortFields);

        // The groups and the ordered collected groups are walked in lock-step: the former carries
        // the sort values and the collapse key, the latter the id of the top document.
        final Iterator<SearchGroup<T>> groupIt = topGroups.iterator();
        final Iterator<CollectedSearchGroup<T>> collectedIt = orderedGroups.iterator();
        for (int slot = 0; groupIt.hasNext(); slot++) {
            final SearchGroup<T> group = groupIt.next();
            assert collectedIt.hasNext();
            final CollectedSearchGroup<T> collected = collectedIt.next();

            final float score;
            if (scoreSlot == -1) {
                score = Float.NaN;
            } else {
                score = (float) group.sortValues[scoreSlot];
            }
            hits[slot] = new FieldDoc(collected.topDoc, score, group.sortValues);
            keys[slot] = group.groupValue;
        }
        return new CollapseTopFieldDocs(collapseField, totalHitCount, hits, sortFields,
            keys, maxScore);
    }

    /**
     * Returns the index of the first sort field of type {@code SCORE}, or {@code -1} if there is none.
     */
    private static int findScorePosition(SortField[] sortFields) {
        int position = 0;
        for (SortField field : sortFields) {
            if (field.getType() == SCORE) {
                return position;
            }
            position++;
        }
        return -1;
    }

    /**
     * Scores are needed when the parent collector needs them or when the max score is tracked.
     */
    @Override
    public boolean needsScores() {
        return super.needsScores() || trackMaxScore;
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        super.setScorer(scorer);
        // Keep a handle on the scorer so that collect can read the score of the current document.
        this.scorer = scorer;
    }

    /**
     * Delegates to the parent collector, then updates the max score (if tracked) and the hit count.
     */
    @Override
    public void collect(int doc) throws IOException {
        super.collect(doc);
        if (trackMaxScore) {
            final float current = scorer.score();
            maxScore = Math.max(maxScore, current);
        }
        totalHitCount += 1;
    }

    /** Collapsing collector whose keys are read from a {@link CollapsingDocValuesSource.Numeric}. */
    private static class Numeric extends CollapsingTopDocsCollector<Long> {
        private final CollapsingDocValuesSource.Numeric values;

        private Numeric(String collapseField, Sort sort, int topN, boolean trackMaxScore) throws IOException {
            super(collapseField, sort, topN, trackMaxScore);
            this.values = new CollapsingDocValuesSource.Numeric(collapseField);
        }

        @Override
        protected void doSetNextReader(LeafReaderContext readerContext) throws IOException {
            super.doSetNextReader(readerContext);
            values.setNextReader(readerContext.reader());
        }

        @Override
        protected Long getDocGroupValue(int doc) {
            return values.get(doc);
        }

        @Override
        protected Long copyDocGroupValue(Long groupValue, Long reuse) {
            return values.copy(groupValue, reuse);
        }
    }

    /** Collapsing collector whose keys are read from a {@link CollapsingDocValuesSource.Keyword}. */
    private static class Keyword extends CollapsingTopDocsCollector<BytesRef> {
        private final CollapsingDocValuesSource.Keyword values;

        private Keyword(String collapseField, Sort sort, int topN, boolean trackMaxScore) throws IOException {
            super(collapseField, sort, topN, trackMaxScore);
            this.values = new CollapsingDocValuesSource.Keyword(collapseField);
        }

        @Override
        protected void doSetNextReader(LeafReaderContext readerContext) throws IOException {
            super.doSetNextReader(readerContext);
            values.setNextReader(readerContext.reader());
        }

        @Override
        protected BytesRef getDocGroupValue(int doc) {
            return values.get(doc);
        }

        @Override
        protected BytesRef copyDocGroupValue(BytesRef groupValue, BytesRef reuse) {
            return values.copy(groupValue, reuse);
        }
    }

    /**
     * Create a collapsing top docs collector on a {@link org.apache.lucene.index.NumericDocValues} field.
     * A {@link org.apache.lucene.index.SortedNumericDocValues} field is accepted as well, but collection
     * fails with an {@link IllegalStateException} if a document holds more than one value for the field.
     *
     * @param collapseField The field used to group documents.
     * @param sort          The {@link Sort} used to sort the collapsed hits.
     *                      The collapsing keeps only the top sorted document per collapsed key.
     *                      This must be non-null, ie, if you want to groupSort by relevance
     *                      use Sort.RELEVANCE.
     * @param topN          How many top groups to keep.
     * @param trackMaxScore Whether the maximum score over all collected documents is tracked.
     *                      When {@code false} the max score of the result is {@link Float#NaN}.
     * @return a collector collapsing on the numeric value of {@code collapseField}
     * @throws IOException When I/O related errors occur
     */
    public static CollapsingTopDocsCollector<?> createNumeric(String collapseField, Sort sort,
                                                              int topN, boolean trackMaxScore) throws IOException {
        return new Numeric(collapseField, sort, topN, trackMaxScore);
    }

    /**
     * Create a collapsing top docs collector on a {@link org.apache.lucene.index.SortedDocValues} field.
     * A {@link org.apache.lucene.index.SortedSetDocValues} field is accepted as well, but collection
     * fails with an {@link IllegalStateException} if a document holds more than one value for the field.
     *
     * @param collapseField The field used to group documents.
     * @param sort          The {@link Sort} used to sort the collapsed hits. The collapsing keeps only the top sorted
     *                      document per collapsed key.
     *                      This must be non-null, ie, if you want to groupSort by relevance use Sort.RELEVANCE.
     * @param topN          How many top groups to keep.
     * @param trackMaxScore Whether the maximum score over all collected documents is tracked.
     *                      When {@code false} the max score of the result is {@link Float#NaN}.
     * @return a collector collapsing on the keyword value of {@code collapseField}
     * @throws IOException When I/O related errors occur
     */
    public static CollapsingTopDocsCollector<?> createKeyword(String collapseField, Sort sort,
                                                              int topN, boolean trackMaxScore) throws IOException {
        return new Keyword(collapseField, sort, topN, trackMaxScore);
    }
}