diff options
Diffstat (limited to 'core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java')
-rw-r--r-- | core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java | 208 |
1 files changed, 208 insertions, 0 deletions
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java new file mode 100644 index 0000000000..01f70d4b27 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java @@ -0,0 +1,208 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.fetch.subphase.highlight; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.search.highlight.Encoder; +import org.apache.lucene.search.highlight.Formatter; +import org.apache.lucene.search.highlight.Fragmenter; +import org.apache.lucene.search.highlight.NullFragmenter; +import org.apache.lucene.search.highlight.QueryScorer; +import org.apache.lucene.search.highlight.SimpleFragmenter; +import org.apache.lucene.search.highlight.SimpleHTMLFormatter; +import org.apache.lucene.search.highlight.SimpleSpanFragmenter; +import org.apache.lucene.search.highlight.TextFragment; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.ExceptionsHelper; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.search.fetch.FetchPhaseExecutionException; +import org.elasticsearch.search.fetch.FetchSubPhase; +import org.elasticsearch.search.internal.SearchContext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * + */ +public class PlainHighlighter implements Highlighter { + + private static final String CACHE_KEY = "highlight-plain"; + + @Override + public HighlightField highlight(HighlighterContext highlighterContext) { + SearchContextHighlight.Field field = highlighterContext.field; + SearchContext context = highlighterContext.context; + FetchSubPhase.HitContext hitContext = highlighterContext.hitContext; + FieldMapper mapper = highlighterContext.mapper; + + Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; + + if (!hitContext.cache().containsKey(CACHE_KEY)) { + Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>(); + hitContext.cache().put(CACHE_KEY, mappers); + } + @SuppressWarnings("unchecked") + Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY); + + org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper); + if (entry == null) { + QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query, field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null); + queryScorer.setExpandMultiTermQuery(true); + Fragmenter fragmenter; + if (field.fieldOptions().numberOfFragments() == 0) { + fragmenter = new NullFragmenter(); + } else if (field.fieldOptions().fragmenter() == null) { + fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize()); + } else if ("simple".equals(field.fieldOptions().fragmenter())) { + fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize()); + } else if ("span".equals(field.fieldOptions().fragmenter())) { + fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize()); + } else { + throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]"); + } + Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]); + + entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer); + entry.setTextFragmenter(fragmenter); + // always highlight across all data + entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE); + + cache.put(mapper, entry); + } + + // a HACK to make highlighter do highlighting, even though its using the single frag list builder + int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments(); + ArrayList<TextFragment> fragsList = new ArrayList<>(); + List<Object> textsToHighlight; + Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().type()).mappers().indexAnalyzer(); + + try { + textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext); + + for (Object textToHighlight : textsToHighlight) { + String text = textToHighlight.toString(); + + try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) { + if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) { + // can't perform highlighting if the stream has no terms (binary token stream) or no offsets + continue; + } + TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments); + for (TextFragment bestTextFragment : bestTextFragments) { + if (bestTextFragment != null && bestTextFragment.getScore() > 0) { + fragsList.add(bestTextFragment); + } + } + } + } + } catch (Exception e) { + if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) { + // this can happen if for example a field is not_analyzed and ignore_above option is set. + // the field will be ignored when indexing but the huge term is still in the source and + // the plain highlighter will parse the source and try to analyze it. + return null; + } else { + throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); + } + } + if (field.fieldOptions().scoreOrdered()) { + CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() { + @Override + public int compare(TextFragment o1, TextFragment o2) { + return Math.round(o2.getScore() - o1.getScore()); + } + }); + } + String[] fragments; + // number_of_fragments is set to 0 but we have a multivalued field + if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) { + fragments = new String[fragsList.size()]; + for (int i = 0; i < fragsList.size(); i++) { + fragments[i] = fragsList.get(i).toString(); + } + } else { + // refine numberOfFragments if needed + numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments; + fragments = new String[numberOfFragments]; + for (int i = 0; i < fragments.length; i++) { + fragments[i] = fragsList.get(i).toString(); + } + } + + if (fragments.length > 0) { + return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments)); + } + + int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize(); + if (noMatchSize > 0 && textsToHighlight.size() > 0) { + // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary. + String fieldContents = textsToHighlight.get(0).toString(); + int end; + try { + end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents); + } catch (Exception e) { + throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e); + } + if (end > 0) { + return new HighlightField(highlighterContext.fieldName, new Text[] { new Text(fieldContents.substring(0, end)) }); + } + } + return null; + } + + @Override + public boolean canHighlight(FieldMapper fieldMapper) { + return true; + } + + private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName, String contents) throws IOException { + try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) { + if (!tokenStream.hasAttribute(OffsetAttribute.class)) { + // Can't split on term boundaries without offsets + return -1; + } + int end = -1; + tokenStream.reset(); + while (tokenStream.incrementToken()) { + OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class); + if (attr.endOffset() >= noMatchSize) { + // Jump to the end of this token if it wouldn't put us past the boundary + if (attr.endOffset() == noMatchSize) { + end = noMatchSize; + } + return end; + } + end = attr.endOffset(); + } + tokenStream.end(); + // We've exhausted the token stream so we should just highlight everything. + return end; + } + } +} |