diff options
author | Simon Willnauer <simonw@apache.org> | 2015-06-05 13:12:03 +0200 |
---|---|---|
committer | Simon Willnauer <simonw@apache.org> | 2015-06-05 13:12:03 +0200 |
commit | 15a62448343fd24f8e63f43b1e4b16f50005e4a5 (patch) | |
tree | 7d04660f3f7aef0d679da3e6185af9cf378bf1d0 /core/src/main/java/org/elasticsearch/search/suggest/completion | |
parent | 7ccc193a666e2ae888e7ac93d677a2143e5e07c3 (diff) |
create core module
Diffstat (limited to 'core/src/main/java/org/elasticsearch/search/suggest/completion')
11 files changed, 1767 insertions, 0 deletions
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.suggest.completion;

import com.carrotsearch.hppc.ObjectLongHashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.CompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.context.ContextMapping.ContextQuery;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

/**
 * A {@link CompletionLookupProvider} backed by {@link XAnalyzingSuggester}.
 * <p>
 * On the write side ({@link #consumer(IndexOutput)}) it folds every term's
 * postings (payloads included) into one FST per field and appends per-field
 * offsets plus a trailing meta pointer so the FSTs can be located again. On
 * the read side ({@link #load(IndexInput)}) it rebuilds the FSTs and hands
 * out {@link Lookup} instances (fuzzy or exact) per completion field.
 */
public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider {

    // Bit flags persisted in the per-field options VInt (see consumer()/load()).
    public static final int SERIALIZE_PRESERVE_SEPARATORS = 1;
    public static final int SERIALIZE_HAS_PAYLOADS = 2;
    public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4;

    private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256;
    private static final int MAX_GRAPH_EXPANSIONS = -1;

    public static final String CODEC_NAME = "analyzing";
    public static final int CODEC_VERSION_START = 1;
    public static final int CODEC_VERSION_SERIALIZED_LABELS = 2;
    public static final int CODEC_VERSION_CHECKSUMS = 3;
    public static final int CODEC_VERSION_LATEST = CODEC_VERSION_CHECKSUMS;

    private boolean preserveSep;
    private boolean preservePositionIncrements;
    private int maxSurfaceFormsPerAnalyzedForm;
    private int maxGraphExpansions;
    private boolean hasPayloads;
    private final XAnalyzingSuggester prototype;

    public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) {
        this.preserveSep = preserveSep;
        this.preservePositionIncrements = preservePositionIncrements;
        this.hasPayloads = hasPayloads;
        this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;
        this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
        int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
        // exactFirst is accepted but intentionally ignored for now: it needs to
        // be fixed in the suggester first before it can be supported here.
        //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
        prototype = new XAnalyzingSuggester(null, null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
                preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP,
                XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
    }

    @Override
    public String getName() {
        return "analyzing";
    }

    /**
     * Returns a {@link FieldsConsumer} that builds one suggest FST per field and
     * writes it to {@code output}, followed by a field-name-to-offset table, the
     * offset of that table, and a codec footer.
     */
    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    // Persist the per-field offsets so load() knows where each
                    // FST starts, then record where that table itself begins.
                    long metaPointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(metaPointer);
                    CodecUtil.writeFooter(output);
                } finally {
                    IOUtils.close(output);
                }
            }

            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload payloadHolder = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder =
                            new XAnalyzingSuggester.XBuilder(maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), payloadHolder);
                                builder.addSurface(payloadHolder.surfaceForm.get(), payloadHolder.payload.get(), payloadHolder.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> fst = builder.build();
                    assert fst != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
                    /*
                     * it's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                     * to return null.
                     */
                    if (fst != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        fst.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                    }
                }
            }
        };
    }

    /**
     * Reads back everything {@link #consumer(IndexOutput)} wrote: verifies the
     * checksum (for {@link #CODEC_VERSION_CHECKSUMS}+ files), seeks to the meta
     * pointer stored at the end of the file, and materializes one FST holder per
     * field, returning a {@link LookupFactory} over them.
     */
    @Override
    public LookupFactory load(IndexInput input) throws IOException {
        long sizeInBytes = 0;
        int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
        if (version >= CODEC_VERSION_CHECKSUMS) {
            CodecUtil.checksumEntireFile(input);
        }
        // The long meta pointer sits just before the footer (or at the very end
        // of the file for pre-checksum versions).
        final long metaPointerPosition = input.length() - (version >= CODEC_VERSION_CHECKSUMS ? 8 + CodecUtil.footerLength() : 8);
        final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<>();
        input.seek(metaPointerPosition);
        long metaPointer = input.readLong();
        input.seek(metaPointer);
        int numFields = input.readVInt();

        // TreeMap keyed by offset so fields are visited in file order.
        Map<Long, String> fieldsByOffset = new TreeMap<>();
        for (int i = 0; i < numFields; i++) {
            String name = input.readString();
            long offset = input.readVLong();
            fieldsByOffset.put(offset, name);
        }

        for (Map.Entry<Long, String> entry : fieldsByOffset.entrySet()) {
            input.seek(entry.getKey());
            FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
                    PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
            int maxAnalyzedPathsForOneInput = input.readVInt();
            int maxSurfaceFormsPerAnalyzedForm = input.readVInt();
            int maxGraphExpansions = input.readInt();
            int options = input.readVInt();
            boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPARATORS) != 0;
            boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
            boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;

            // first version did not include these three fields, so fall back to old default (before the analyzingsuggester
            // was updated in Lucene, so we cannot use the suggester defaults)
            int sepLabel, payloadSep, endByte, holeCharacter;
            switch (version) {
                case CODEC_VERSION_START:
                    sepLabel = 0xFF;
                    payloadSep = '\u001f';
                    endByte = 0x0;
                    holeCharacter = '\u001E';
                    break;
                default:
                    sepLabel = input.readVInt();
                    endByte = input.readVInt();
                    payloadSep = input.readVInt();
                    holeCharacter = input.readVInt();
            }

            AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements,
                    maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput,
                    fst, sepLabel, payloadSep, endByte, holeCharacter);
            sizeInBytes += fst.ramBytesUsed();
            lookupMap.put(entry.getValue(), holder);
        }
        final long ramBytesUsed = sizeInBytes;
        return new LookupFactory() {
            @Override
            public Lookup getLookup(CompletionFieldMapper mapper, CompletionSuggestionContext suggestionContext) {
                AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(mapper.fieldType().names().indexName());
                if (analyzingSuggestHolder == null) {
                    return null;
                }
                int flags = analyzingSuggestHolder.getPreserveSeparator() ? XAnalyzingSuggester.PRESERVE_SEP : 0;

                final XAnalyzingSuggester suggester;
                final Automaton queryPrefix = mapper.requiresContext()
                        ? ContextQuery.toAutomaton(analyzingSuggestHolder.getPreserveSeparator(), suggestionContext.getContextQueries())
                        : null;

                if (suggestionContext.isFuzzy()) {
                    suggester = new XFuzzySuggester(mapper.fieldType().indexAnalyzer(), queryPrefix, mapper.fieldType().searchAnalyzer(), flags,
                            analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                            suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
                            suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(), suggestionContext.isFuzzyUnicodeAware(),
                            analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
                            analyzingSuggestHolder.holeCharacter);
                } else {
                    suggester = new XAnalyzingSuggester(mapper.fieldType().indexAnalyzer(), queryPrefix, mapper.fieldType().searchAnalyzer(), flags,
                            analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                            analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
                            analyzingSuggestHolder.holeCharacter);
                }
                return suggester;
            }

            @Override
            public CompletionStats stats(String... fields) {
                long sizeInBytes = 0;
                ObjectLongHashMap<String> completionFields = null;
                if (fields != null && fields.length > 0) {
                    completionFields = new ObjectLongHashMap<>(fields.length);
                }

                for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) {
                    sizeInBytes += entry.getValue().fst.ramBytesUsed();
                    if (fields == null || fields.length == 0) {
                        continue;
                    }
                    if (Regex.simpleMatch(fields, entry.getKey())) {
                        long fstSize = entry.getValue().fst.ramBytesUsed();
                        completionFields.addTo(entry.getKey(), fstSize);
                    }
                }

                return new CompletionStats(sizeInBytes, completionFields);
            }

            @Override
            AnalyzingSuggestHolder getAnalyzingSuggestHolder(CompletionFieldMapper mapper) {
                return lookupMap.get(mapper.fieldType().names().indexName());
            }

            @Override
            public long ramBytesUsed() {
                return ramBytesUsed;
            }

            @Override
            public Collection<Accountable> getChildResources() {
                return Accountables.namedAccountables("field", lookupMap);
            }
        };
    }

    /** Per-field bundle of a loaded suggest FST plus the options it was built with. */
    static class AnalyzingSuggestHolder implements Accountable {
        final boolean preserveSep;
        final boolean preservePositionIncrements;
        final int maxSurfaceFormsPerAnalyzedForm;
        final int maxGraphExpansions;
        final boolean hasPayloads;
        final int maxAnalyzedPathsForOneInput;
        final FST<Pair<Long, BytesRef>> fst;
        final int sepLabel;
        final int payloadSep;
        final int endByte;
        final int holeCharacter;

        public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm,
                                      int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                                      FST<Pair<Long, BytesRef>> fst) {
            this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads,
                    maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP,
                    XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
        }

        public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm,
                                      int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                                      FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte, int holeCharacter) {
            this.preserveSep = preserveSep;
            this.preservePositionIncrements = preservePositionIncrements;
            this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
            this.maxGraphExpansions = maxGraphExpansions;
            this.hasPayloads = hasPayloads;
            this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
            this.fst = fst;
            this.sepLabel = sepLabel;
            this.payloadSep = payloadSep;
            this.endByte = endByte;
            this.holeCharacter = holeCharacter;
        }

        public boolean getPreserveSeparator() {
            return preserveSep;
        }

        public boolean getPreservePositionIncrements() {
            return preservePositionIncrements;
        }

        public boolean hasPayloads() {
            return hasPayloads;
        }

        @Override
        public long ramBytesUsed() {
            // fst may be null (see write(): all docs for a field deleted during merge)
            return fst != null ? fst.ramBytesUsed() : 0;
        }

        @Override
        public Collection<Accountable> getChildResources() {
            if (fst != null) {
                return Collections.singleton(Accountables.namedAccountable("fst", fst));
            }
            return Collections.emptyList();
        }
    }

    @Override
    public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
        return prototype.toFiniteStrings(prototype.getTokenStreamToAutomaton(), stream);
    }
}
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.suggest.completion;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMap.Builder;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader.FilterTerms;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.IOContext.Context;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.completion.CompletionTokenStream.ToFiniteStrings;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * This {@link PostingsFormat} is basically a T-Sink for a default postings
 * format that is used to store postings on disk fitting the lucene APIs and
 * builds a suggest FST as an auxiliary data structure next to the actual
 * postings format. It uses the delegate postings format for simplicity to
 * handle all the merge operations. The auxiliary suggest FST data structure is
 * only loaded if a FieldsProducer is requested for reading, for merging it uses
 * the low memory delegate postings format.
 */
public class Completion090PostingsFormat extends PostingsFormat {

    public static final String CODEC_NAME = "completion090";
    public static final int SUGGEST_CODEC_VERSION = 1;
    public static final int SUGGEST_VERSION_CURRENT = SUGGEST_CODEC_VERSION;
    public static final String EXTENSION = "cmp";

    private final static ESLogger logger = Loggers.getLogger(Completion090PostingsFormat.class);
    private PostingsFormat delegatePostingsFormat;
    // Registry of lookup providers by name; consulted at read time via the
    // provider name stored in the .cmp file.
    private final static Map<String, CompletionLookupProvider> providers;
    private CompletionLookupProvider writeProvider;


    static {
        final CompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, false, true, false);
        final Builder<String, CompletionLookupProvider> builder = ImmutableMap.builder();
        providers = builder.put(provider.getName(), provider).build();
    }

    public Completion090PostingsFormat(PostingsFormat delegatePostingsFormat, CompletionLookupProvider provider) {
        super(CODEC_NAME);
        this.delegatePostingsFormat = delegatePostingsFormat;
        this.writeProvider = provider;
        assert delegatePostingsFormat != null && writeProvider != null;
    }

    /*
     * Used only by core Lucene at read-time via Service Provider instantiation
     * do not use at Write-time in application code.
     */
    public Completion090PostingsFormat() {
        super(CODEC_NAME);
    }

    @Override
    public CompletionFieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
        if (delegatePostingsFormat == null) {
            throw new UnsupportedOperationException("Error - " + getClass().getName()
                    + " has been constructed without a choice of PostingsFormat");
        }
        assert writeProvider != null;
        return new CompletionFieldsConsumer(state);
    }

    @Override
    public CompletionFieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
        return new CompletionFieldsProducer(state);
    }

    /**
     * Writes postings through the delegate format and, in parallel, feeds the
     * same fields to the suggest-FST consumer backed by the segment's .cmp file.
     */
    private class CompletionFieldsConsumer extends FieldsConsumer {

        private FieldsConsumer delegatesFieldsConsumer;
        private FieldsConsumer suggestFieldsConsumer;

        public CompletionFieldsConsumer(SegmentWriteState state) throws IOException {
            this.delegatesFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
            String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
            IndexOutput output = null;
            boolean success = false;
            try {
                output = state.directory.createOutput(suggestFSTFile, state.context);
                CodecUtil.writeHeader(output, CODEC_NAME, SUGGEST_VERSION_CURRENT);
                /*
                 * we write the delegate postings format name so we can load it
                 * without getting an instance in the ctor
                 */
                output.writeString(delegatePostingsFormat.getName());
                output.writeString(writeProvider.getName());
                this.suggestFieldsConsumer = writeProvider.consumer(output);
                success = true;
            } finally {
                if (!success) {
                    IOUtils.closeWhileHandlingException(output);
                }
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            delegatesFieldsConsumer.write(fields);
            suggestFieldsConsumer.write(fields);
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(delegatesFieldsConsumer, suggestFieldsConsumer);
        }
    }

    private static class CompletionFieldsProducer extends FieldsProducer {
        // TODO make this class lazyload all the things in order to take advantage of the new merge instance API
        // today we just load everything up-front
        private final FieldsProducer delegateProducer;
        private final LookupFactory lookupFactory;
        private final int version;

        public CompletionFieldsProducer(SegmentReadState state) throws IOException {
            String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION);
            IndexInput input = state.directory.openInput(suggestFSTFile, state.context);
            version = CodecUtil.checkHeader(input, CODEC_NAME, SUGGEST_CODEC_VERSION, SUGGEST_VERSION_CURRENT);
            FieldsProducer delegateProducer = null;
            boolean success = false;
            try {
                PostingsFormat delegatePostingsFormat = PostingsFormat.forName(input.readString());
                String providerName = input.readString();
                CompletionLookupProvider completionLookupProvider = providers.get(providerName);
                if (completionLookupProvider == null) {
                    throw new IllegalStateException("no provider with name [" + providerName + "] registered");
                }
                // TODO: we could clone the ReadState and make it always forward IOContext.MERGE to prevent unnecessary heap usage?
                delegateProducer = delegatePostingsFormat.fieldsProducer(state);
                /*
                 * If we are merging we don't load the FSTs at all such that we
                 * don't consume so much memory during merge
                 */
                if (state.context.context != Context.MERGE) {
                    // TODO: maybe we can do this in a fully lazy fashion based on some configuration
                    // eventually we should have some kind of circuit breaker that prevents us from going OOM here
                    // with some configuration
                    this.lookupFactory = completionLookupProvider.load(input);
                } else {
                    this.lookupFactory = null;
                }
                this.delegateProducer = delegateProducer;
                success = true;
            } finally {
                if (!success) {
                    IOUtils.closeWhileHandlingException(delegateProducer, input);
                } else {
                    IOUtils.close(input);
                }
            }
        }

        @Override
        public void close() throws IOException {
            IOUtils.close(delegateProducer);
        }

        @Override
        public Iterator<String> iterator() {
            return delegateProducer.iterator();
        }

        @Override
        public Terms terms(String field) throws IOException {
            final Terms terms = delegateProducer.terms(field);
            if (terms == null || lookupFactory == null) {
                return terms;
            }
            return new CompletionTerms(terms, lookupFactory);
        }

        @Override
        public int size() {
            return delegateProducer.size();
        }

        @Override
        public long ramBytesUsed() {
            return (lookupFactory == null ? 0 : lookupFactory.ramBytesUsed()) + delegateProducer.ramBytesUsed();
        }

        @Override
        public Collection<Accountable> getChildResources() {
            List<Accountable> resources = new ArrayList<>();
            if (lookupFactory != null) {
                resources.add(Accountables.namedAccountable("lookup", lookupFactory));
            }
            resources.add(Accountables.namedAccountable("delegate", delegateProducer));
            return Collections.unmodifiableList(resources);
        }

        @Override
        public void checkIntegrity() throws IOException {
            delegateProducer.checkIntegrity();
        }

        @Override
        public FieldsProducer getMergeInstance() throws IOException {
            return delegateProducer.getMergeInstance();
        }
    }

    /** {@link Terms} wrapper that exposes the suggest {@link Lookup} for a completion field. */
    public static final class CompletionTerms extends FilterTerms {
        private final LookupFactory lookup;

        public CompletionTerms(Terms delegate, LookupFactory lookup) {
            super(delegate);
            this.lookup = lookup;
        }

        public Lookup getLookup(CompletionFieldMapper mapper, CompletionSuggestionContext suggestionContext) {
            return lookup.getLookup(mapper, suggestionContext);
        }

        public CompletionStats stats(String... fields) {
            return lookup.stats(fields);
        }
    }

    public static abstract class CompletionLookupProvider implements PayloadProcessor, ToFiniteStrings {

        public static final char UNIT_SEPARATOR = '\u001f';

        public abstract FieldsConsumer consumer(IndexOutput output) throws IOException;

        public abstract String getName();

        public abstract LookupFactory load(IndexInput input) throws IOException;

        /**
         * Encodes weight, surface form and payload into a single payload blob:
         * VLong(weight + 1), VInt(surface length), surface bytes, VInt(payload
         * length), payload bytes. Inverse of {@link #parsePayload}.
         *
         * @throws IllegalArgumentException if the weight is out of range or the
         *         surface form contains the reserved unit separator U+001F
         */
        @Override
        public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException {
            if (weight < -1 || weight > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("weight must be >= -1 && <= Integer.MAX_VALUE");
            }
            // BUGFIX: scan the surface form's actual window [offset, offset+length)
            // instead of [0, length) — a sliced BytesRef was scanned at the wrong bytes.
            for (int i = surfaceForm.offset; i < surfaceForm.offset + surfaceForm.length; i++) {
                if (surfaceForm.bytes[i] == UNIT_SEPARATOR) {
                    throw new IllegalArgumentException(
                            "surface form cannot contain unit separator character U+001F; this character is reserved");
                }
            }
            ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
            OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream);
            output.writeVLong(weight + 1); // +1 so that weight -1 encodes as VLong 0
            output.writeVInt(surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeVInt(payload.length);
            // BUGFIX: honor payload.offset (was hard-coded 0, corrupting sliced refs)
            output.writeBytes(payload.bytes, payload.offset, payload.length);

            output.close();
            return new BytesRef(byteArrayOutputStream.toByteArray());
        }

        /**
         * Decodes a blob produced by {@link #buildPayload} into {@code ref}.
         */
        @Override
        public void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException {
            ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length);
            InputStreamDataInput input = new InputStreamDataInput(byteArrayInputStream);
            ref.weight = input.readVLong() - 1;
            int len = input.readVInt();
            ref.surfaceForm.grow(len);
            ref.surfaceForm.setLength(len);
            input.readBytes(ref.surfaceForm.bytes(), 0, ref.surfaceForm.length());
            len = input.readVInt();
            ref.payload.grow(len);
            ref.payload.setLength(len);
            input.readBytes(ref.payload.bytes(), 0, ref.payload.length());
            input.close();
        }
    }

    /**
     * Aggregates {@link CompletionStats} over all completion fields of all leaves
     * of {@code indexReader}, optionally restricted to field-name patterns.
     */
    public CompletionStats completionStats(IndexReader indexReader, String... fields) {
        CompletionStats completionStats = new CompletionStats();
        for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
            LeafReader atomicReader = atomicReaderContext.reader();
            try {
                for (String fieldName : atomicReader.fields()) {
                    Terms terms = atomicReader.fields().terms(fieldName);
                    if (terms instanceof CompletionTerms) {
                        CompletionTerms completionTerms = (CompletionTerms) terms;
                        completionStats.add(completionTerms.stats(fields));
                    }
                }
            } catch (IOException e) {
                logger.error("Could not get completion stats: {}", e, e.getMessage());
            }
        }

        return completionStats;
    }

    public static abstract class LookupFactory implements Accountable {
        public abstract Lookup getLookup(CompletionFieldMapper mapper, CompletionSuggestionContext suggestionContext);
        public abstract CompletionStats stats(String... fields);
        abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(CompletionFieldMapper mapper);
    }
}
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.suggest.completion;

import com.carrotsearch.hppc.ObjectLongHashMap;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentBuilderString;

import java.io.IOException;

/**
 * Memory statistics for completion suggesters: a total size in bytes plus an
 * optional per-field byte-size breakdown. Serializable over the transport
 * layer ({@link Streamable}) and renderable as XContent.
 */
public class CompletionStats implements Streamable, ToXContent {

    // Total size of all completion FSTs, in bytes.
    private long sizeInBytes;

    // Per-field sizes; null when no field-level breakdown was requested.
    @Nullable
    private ObjectLongHashMap<String> fields;

    public CompletionStats() {
    }

    public CompletionStats(long size, @Nullable ObjectLongHashMap<String> fields) {
        this.sizeInBytes = size;
        this.fields = fields;
    }

    public long getSizeInBytes() {
        return sizeInBytes;
    }

    public ByteSizeValue getSize() {
        return new ByteSizeValue(sizeInBytes);
    }

    public ObjectLongHashMap<String> getFields() {
        return fields;
    }

    @Override
    public void readFrom(StreamInput in) throws IOException {
        sizeInBytes = in.readVLong();
        // A boolean flag marks whether a per-field map follows on the wire.
        if (in.readBoolean()) {
            int size = in.readVInt();
            fields = new ObjectLongHashMap<>(size);
            for (int i = 0; i < size; i++) {
                fields.put(in.readString(), in.readVLong());
            }
        }
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeVLong(sizeInBytes);
        if (fields == null) {
            out.writeBoolean(false);
        } else {
            out.writeBoolean(true);
            out.writeVInt(fields.size());

            assert !fields.containsKey(null);
            // Walk the hppc backing arrays directly; null slots are empty buckets.
            final Object[] fieldNames = fields.keys;
            final long[] fieldSizes = fields.values;
            for (int slot = 0; slot < fieldNames.length; slot++) {
                if (fieldNames[slot] != null) {
                    out.writeString((String) fieldNames[slot]);
                    out.writeVLong(fieldSizes[slot]);
                }
            }
        }
    }

    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(Fields.COMPLETION);
        builder.byteSizeField(Fields.SIZE_IN_BYTES, Fields.SIZE, sizeInBytes);
        if (fields != null) {
            builder.startObject(Fields.FIELDS);

            assert !fields.containsKey(null);
            final Object[] fieldNames = fields.keys;
            final long[] fieldSizes = fields.values;
            for (int slot = 0; slot < fieldNames.length; slot++) {
                if (fieldNames[slot] != null) {
                    builder.startObject((String) fieldNames[slot], XContentBuilder.FieldCaseConversion.NONE);
                    builder.byteSizeField(Fields.SIZE_IN_BYTES, Fields.SIZE, fieldSizes[slot]);
                    builder.endObject();
                }
            }
            builder.endObject();
        }
        builder.endObject();
        return builder;
    }

    public static CompletionStats readCompletionStats(StreamInput in) throws IOException {
        CompletionStats stats = new CompletionStats();
        stats.readFrom(in);
        return stats;
    }

    static final class Fields {
        static final XContentBuilderString COMPLETION = new XContentBuilderString("completion");
        static final XContentBuilderString SIZE_IN_BYTES = new XContentBuilderString("size_in_bytes");
        static final XContentBuilderString SIZE = new XContentBuilderString("size");
        static final XContentBuilderString FIELDS = new XContentBuilderString("fields");
    }

    /**
     * Merges another stats object into this one; a null argument is a no-op.
     * Per-field sizes are summed field by field.
     */
    public void add(CompletionStats completion) {
        if (completion == null) {
            return;
        }

        sizeInBytes += completion.getSizeInBytes();

        if (completion.fields == null) {
            return;
        }
        if (fields == null) {
            fields = completion.fields.clone();
        } else {
            assert !completion.fields.containsKey(null);
            final Object[] fieldNames = completion.fields.keys;
            final long[] fieldSizes = completion.fields.values;
            for (int slot = 0; slot < fieldNames.length; slot++) {
                if (fieldNames[slot] != null) {
                    fields.addTo((String) fieldNames[slot], fieldSizes[slot]);
                }
            }
        }
    }
}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestParser.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.index.query.IndexQueryParserService;
import org.elasticsearch.search.suggest.SuggestContextParser;
import org.elasticsearch.search.suggest.SuggestionSearchContext;
import org.elasticsearch.search.suggest.context.ContextMapping.ContextQuery;

import java.io.IOException;
import java.util.List;

import static org.elasticsearch.search.suggest.SuggestUtils.parseSuggestContext;

/**
 * Parses the request body of a {@code completion} suggestion into a
 * {@link CompletionSuggestionContext}: common suggest fields, the optional
 * {@code fuzzy} flag/object, and the optional {@code context} object (which is
 * buffered and re-parsed once the field mapping is known).
 */
public class CompletionSuggestParser implements SuggestContextParser {

    private static final ParseField FUZZINESS = Fuzziness.FIELD.withDeprecation("edit_distance");

    private final CompletionSuggester completionSuggester;

    public CompletionSuggestParser(CompletionSuggester completionSuggester) {
        this.completionSuggester = completionSuggester;
    }

    @Override
    public SuggestionSearchContext.SuggestionContext parse(XContentParser parser, MapperService mapperService,
            IndexQueryParserService queryParserService) throws IOException {
        XContentParser.Token token;
        String fieldName = null;
        CompletionSuggestionContext suggestion = new CompletionSuggestionContext(completionSuggester);

        // Re-parseable copy of the "context" sub-object; it can only be
        // interpreted after the field's mapping has been resolved below.
        XContentParser contextParser = null;

        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                fieldName = parser.currentName();
            } else if (token.isValue()) {
                if (!parseSuggestContext(parser, mapperService, fieldName, suggestion)) {
                    // "fuzzy": true|false as a plain flag (object form handled below);
                    // other unrecognized scalar values are silently ignored, as before
                    if (token == XContentParser.Token.VALUE_BOOLEAN && "fuzzy".equals(fieldName)) {
                        suggestion.setFuzzy(parser.booleanValue());
                    }
                }
            } else if (token == XContentParser.Token.START_OBJECT) {
                if ("fuzzy".equals(fieldName)) {
                    suggestion.setFuzzy(true);
                    parseFuzzyOptions(parser, suggestion);
                } else if ("context".equals(fieldName)) {
                    contextParser = copyContext(parser);
                } else {
                    throw new IllegalArgumentException("suggester [completion] doesn't support field [" + fieldName + "]");
                }
            } else {
                // fix: message normalized to "suggester [completion]" for consistency
                // with every other error thrown by this parser
                throw new IllegalArgumentException("suggester [completion] doesn't support field [" + fieldName + "]");
            }
        }

        suggestion.mapper((CompletionFieldMapper) mapperService.smartNameFieldMapper(suggestion.getField()));

        CompletionFieldMapper mapper = suggestion.mapper();
        if (mapper != null) {
            if (mapper.requiresContext()) {
                if (contextParser == null) {
                    throw new IllegalArgumentException("suggester [completion] requires context to be setup");
                }
                contextParser.nextToken();
                List<ContextQuery> contextQueries = ContextQuery.parseQueries(mapper.getContextMapping(), contextParser);
                suggestion.setContextQuery(contextQueries);
            } else if (contextParser != null) {
                throw new IllegalArgumentException("suggester [completion] doesn't expect any context");
            }
        }
        return suggestion;
    }

    /** Parses the body of the {@code "fuzzy": { ... }} object. Unknown keys are ignored. */
    private static void parseFuzzyOptions(XContentParser parser, CompletionSuggestionContext suggestion) throws IOException {
        XContentParser.Token token;
        String fuzzyConfigName = null;
        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                fuzzyConfigName = parser.currentName();
            } else if (token.isValue()) {
                if (FUZZINESS.match(fuzzyConfigName, ParseField.EMPTY_FLAGS)) {
                    suggestion.setFuzzyEditDistance(Fuzziness.parse(parser).asDistance());
                } else if ("transpositions".equals(fuzzyConfigName)) {
                    suggestion.setFuzzyTranspositions(parser.booleanValue());
                } else if ("min_length".equals(fuzzyConfigName) || "minLength".equals(fuzzyConfigName)) {
                    suggestion.setFuzzyMinLength(parser.intValue());
                } else if ("prefix_length".equals(fuzzyConfigName) || "prefixLength".equals(fuzzyConfigName)) {
                    suggestion.setFuzzyPrefixLength(parser.intValue());
                } else if ("unicode_aware".equals(fuzzyConfigName) || "unicodeAware".equals(fuzzyConfigName)) {
                    suggestion.setFuzzyUnicodeAware(parser.booleanValue());
                }
            }
        }
    }

    /** Copies the current structure so it can be re-parsed once the mapping is available. */
    private static XContentParser copyContext(XContentParser parser) throws IOException {
        XContentBuilder builder = XContentFactory.contentBuilder(parser.contentType());
        builder.copyCurrentStructure(parser);
        BytesReference bytes = builder.bytes();
        return parser.contentType().xContent().createParser(bytes);
    }
}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import com.google.common.collect.Maps;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.text.StringText;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.SuggestContextParser;
import org.elasticsearch.search.suggest.Suggester;
import org.elasticsearch.search.suggest.completion.CompletionSuggestion.Entry.Option;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

/**
 * Suggester backing the {@code completion} suggestion type: runs a prefix
 * lookup against each segment's completion structure, de-duplicates surface
 * forms across segments (keeping the highest score), and returns the top-N
 * options sorted by descending score.
 */
public class CompletionSuggester extends Suggester<CompletionSuggestionContext> {

    private static final ScoreComparator scoreComparator = new ScoreComparator();

    @Override
    protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>> innerExecute(
            String name, CompletionSuggestionContext suggestionContext, IndexSearcher searcher, CharsRefBuilder spare)
            throws IOException {
        if (suggestionContext.mapper() == null || !(suggestionContext.mapper() instanceof CompletionFieldMapper)) {
            throw new ElasticsearchException("Field [" + suggestionContext.getField() + "] is not a completion suggest field");
        }
        final IndexReader reader = searcher.getIndexReader();
        CompletionSuggestion response = new CompletionSuggestion(name, suggestionContext.getSize());
        spare.copyUTF8Bytes(suggestionContext.getText());

        CompletionSuggestion.Entry resultEntry =
                new CompletionSuggestion.Entry(new StringText(spare.toString()), 0, spare.length());
        response.addTerm(resultEntry);

        final String fieldName = suggestionContext.getField();
        // Best-scoring option per surface form, de-duplicated across segments.
        Map<String, CompletionSuggestion.Entry.Option> bestBySurfaceForm =
                Maps.newHashMapWithExpectedSize(reader.leaves().size() * suggestionContext.getSize());
        for (LeafReaderContext leafContext : reader.leaves()) {
            LeafReader leaf = leafContext.reader();
            Terms terms = leaf.fields().terms(fieldName);
            if (!(terms instanceof Completion090PostingsFormat.CompletionTerms)) {
                continue;
            }
            final Completion090PostingsFormat.CompletionTerms lookupTerms =
                    (Completion090PostingsFormat.CompletionTerms) terms;
            final Lookup lookup = lookupTerms.getLookup(suggestionContext.mapper(), suggestionContext);
            if (lookup == null) {
                // No lookup for this segment: possible when a merge dropped every
                // doc of the segment that had a value for this field.
                continue;
            }
            for (Lookup.LookupResult result : lookup.lookup(spare.get(), false, suggestionContext.getSize())) {
                final String surfaceForm = result.key.toString();
                final float score = result.value;
                final Option existing = bestBySurfaceForm.get(surfaceForm);
                if (existing == null) {
                    bestBySurfaceForm.put(surfaceForm, new CompletionSuggestion.Entry.Option(
                            new StringText(surfaceForm), score,
                            result.payload == null ? null : new BytesArray(result.payload)));
                } else if (existing.getScore() < score) {
                    // Keep only the highest-scoring occurrence of this surface form.
                    existing.setScore(score);
                    existing.setPayload(result.payload == null ? null : new BytesArray(result.payload));
                }
            }
        }
        final List<CompletionSuggestion.Entry.Option> options = new ArrayList<>(bestBySurfaceForm.values());
        CollectionUtil.introSort(options, scoreComparator);

        final int optionCount = Math.min(suggestionContext.getSize(), options.size());
        for (int i = 0; i < optionCount; i++) {
            resultEntry.addOption(options.get(i));
        }

        return response;
    }

    @Override
    public String[] names() {
        return new String[] { "completion" };
    }

    @Override
    public SuggestContextParser getContextParser() {
        return new CompletionSuggestParser(this);
    }

    /** Orders options by descending score. */
    public static class ScoreComparator implements Comparator<CompletionSuggestion.Entry.Option> {
        @Override
        public int compare(Option first, Option second) {
            return Float.compare(second.getScore(), first.getScore());
        }
    }
}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestion.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.search.suggest.Suggest;

import java.io.IOException;
import java.util.Map;

/**
 * Suggestion result of the {@code completion} suggester. Each option may carry
 * an arbitrary payload that was stored alongside the suggestion at index time.
 */
public class CompletionSuggestion extends Suggest.Suggestion<CompletionSuggestion.Entry> {

    // Stream type discriminator for suggestion serialization.
    public static final int TYPE = 2;

    public CompletionSuggestion() {
    }

    public CompletionSuggestion(String name, int size) {
        super(name, size);
    }

    @Override
    public int getType() {
        return TYPE;
    }

    @Override
    protected Entry newEntry() {
        return new Entry();
    }

    /** One entry per suggest text term; holds the matched options. */
    public static class Entry extends org.elasticsearch.search.suggest.Suggest.Suggestion.Entry<CompletionSuggestion.Entry.Option> {

        public Entry(Text text, int offset, int length) {
            super(text, offset, length);
        }

        protected Entry() {
            super();
        }

        @Override
        protected Option newOption() {
            return new Option();
        }

        /** A single completion with its score and optional index-time payload. */
        public static class Option extends org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option {

            // Raw payload bytes stored with the suggestion; may be null.
            private BytesReference payloadBytes;

            public Option(Text text, float score, BytesReference payload) {
                super(text, score);
                this.payloadBytes = payload;
            }

            protected Option() {
                super();
            }

            public void setPayload(BytesReference payload) {
                this.payloadBytes = payload;
            }

            public BytesReference getPayload() {
                return payloadBytes;
            }

            public String getPayloadAsString() {
                return payloadBytes.toUtf8();
            }

            public long getPayloadAsLong() {
                return Long.parseLong(payloadBytes.toUtf8());
            }

            public double getPayloadAsDouble() {
                return Double.parseDouble(payloadBytes.toUtf8());
            }

            public Map<String, Object> getPayloadAsMap() {
                return XContentHelper.convertToMap(payloadBytes, false).v2();
            }

            @Override
            public void setScore(float score) {
                super.setScore(score);
            }

            @Override
            protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
                super.innerToXContent(builder, params);
                // Only render a payload field when there is actually a payload.
                if (payloadBytes != null && payloadBytes.length() > 0) {
                    builder.rawField("payload", payloadBytes);
                }
                return builder;
            }

            @Override
            public void readFrom(StreamInput in) throws IOException {
                super.readFrom(in);
                payloadBytes = in.readBytesReference();
            }

            @Override
            public void writeTo(StreamOutput out) throws IOException {
                super.writeTo(out);
                out.writeBytesReference(payloadBytes);
            }
        }
    }

}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.suggest.SuggestBuilder;

import java.io.IOException;

/**
 * Defines a suggest command based on a prefix, typically to provide "auto-complete"
 * functionality for users as they type search terms. The implementation of the
 * completion service uses FSTs that are created at index-time and so must be defined
 * in the mapping with the type "completion" before indexing.
 */
public class CompletionSuggestionBuilder extends SuggestBuilder.SuggestionBuilder<CompletionSuggestionBuilder> {

    public CompletionSuggestionBuilder(String name) {
        super(name, "completion");
    }

    @Override
    protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
        // No completion-specific options beyond the common suggestion fields.
        return builder;
    }
}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.Suggester;
import org.elasticsearch.search.suggest.SuggestionSearchContext;
import org.elasticsearch.search.suggest.context.ContextMapping.ContextQuery;

import java.util.Collections;
import java.util.List;

/**
 * Per-request state for a {@code completion} suggestion: the resolved field
 * mapper, the fuzzy-matching settings (defaulting to the XFuzzySuggester
 * defaults), and any context queries parsed from the request.
 */
public class CompletionSuggestionContext extends SuggestionSearchContext.SuggestionContext {

    private CompletionFieldMapper mapper;
    // Fuzzy matching is off unless explicitly requested.
    private boolean fuzzy = false;
    private int fuzzyEditDistance = XFuzzySuggester.DEFAULT_MAX_EDITS;
    private boolean fuzzyTranspositions = XFuzzySuggester.DEFAULT_TRANSPOSITIONS;
    private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
    private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
    private boolean fuzzyUnicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;
    private List<ContextQuery> contextQueries = Collections.emptyList();

    public CompletionSuggestionContext(Suggester suggester) {
        super(suggester);
    }

    public CompletionFieldMapper mapper() {
        return this.mapper;
    }

    public void mapper(CompletionFieldMapper mapper) {
        this.mapper = mapper;
    }

    public void setFuzzy(boolean fuzzy) {
        this.fuzzy = fuzzy;
    }

    public boolean isFuzzy() {
        return fuzzy;
    }

    public void setFuzzyEditDistance(int editDistance) {
        this.fuzzyEditDistance = editDistance;
    }

    public int getFuzzyEditDistance() {
        return fuzzyEditDistance;
    }

    public void setFuzzyTranspositions(boolean transpositions) {
        this.fuzzyTranspositions = transpositions;
    }

    public boolean isFuzzyTranspositions() {
        return fuzzyTranspositions;
    }

    public void setFuzzyMinLength(int minLength) {
        this.fuzzyMinLength = minLength;
    }

    public int getFuzzyMinLength() {
        return fuzzyMinLength;
    }

    public void setFuzzyPrefixLength(int prefixLength) {
        this.fuzzyPrefixLength = prefixLength;
    }

    public int getFuzzyPrefixLength() {
        return fuzzyPrefixLength;
    }

    public void setFuzzyUnicodeAware(boolean unicodeAware) {
        this.fuzzyUnicodeAware = unicodeAware;
    }

    public boolean isFuzzyUnicodeAware() {
        return fuzzyUnicodeAware;
    }

    public void setContextQuery(List<ContextQuery> queries) {
        this.contextQueries = queries;
    }

    public List<ContextQuery> getContextQueries() {
        return this.contextQueries;
    }
}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionFuzzyBuilder.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.suggest.SuggestBuilder;

import java.io.IOException;

/**
 * A form of {@link CompletionSuggestionBuilder} that supports fuzzy queries allowing
 * matches on typos.
 * Various settings control when and how fuzziness is counted.
 */
public class CompletionSuggestionFuzzyBuilder extends SuggestBuilder.SuggestionBuilder<CompletionSuggestionFuzzyBuilder> {

    private Fuzziness fuzziness = Fuzziness.ONE;
    private boolean fuzzyTranspositions = XFuzzySuggester.DEFAULT_TRANSPOSITIONS;
    private int fuzzyMinLength = XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH;
    private int fuzzyPrefixLength = XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX;
    private boolean unicodeAware = XFuzzySuggester.DEFAULT_UNICODE_AWARE;

    public CompletionSuggestionFuzzyBuilder(String name) {
        super(name, "completion");
    }

    public Fuzziness getFuzziness() {
        return fuzziness;
    }

    /**
     * Sets the level of fuzziness used to create suggestions using a {@link Fuzziness} instance.
     * The default value is {@link Fuzziness#ONE} which allows for an "edit distance" of one.
     */
    public CompletionSuggestionFuzzyBuilder setFuzziness(Fuzziness fuzziness) {
        this.fuzziness = fuzziness;
        return this;
    }

    public boolean isFuzzyTranspositions() {
        return fuzzyTranspositions;
    }

    /**
     * Sets if transpositions (swapping one character for another) counts as one character
     * change or two.
     * Defaults to true, meaning it uses the fuzzier option of counting transpositions as
     * a single change.
     */
    public CompletionSuggestionFuzzyBuilder setFuzzyTranspositions(boolean fuzzyTranspositions) {
        this.fuzzyTranspositions = fuzzyTranspositions;
        return this;
    }

    public int getFuzzyMinLength() {
        return fuzzyMinLength;
    }

    /**
     * Sets the minimum length of input string before fuzzy suggestions are returned, defaulting
     * to 3.
     */
    public CompletionSuggestionFuzzyBuilder setFuzzyMinLength(int fuzzyMinLength) {
        this.fuzzyMinLength = fuzzyMinLength;
        return this;
    }

    public int getFuzzyPrefixLength() {
        return fuzzyPrefixLength;
    }

    /**
     * Sets the minimum length of the input, which is not checked for fuzzy alternatives, defaults to 1
     */
    public CompletionSuggestionFuzzyBuilder setFuzzyPrefixLength(int fuzzyPrefixLength) {
        this.fuzzyPrefixLength = fuzzyPrefixLength;
        return this;
    }

    public boolean isUnicodeAware() {
        return unicodeAware;
    }

    /**
     * Set to true if all measurements (like edit distance, transpositions and lengths) are in unicode
     * code points (actual letters) instead of bytes. Default is false.
     */
    public CompletionSuggestionFuzzyBuilder setUnicodeAware(boolean unicodeAware) {
        this.unicodeAware = unicodeAware;
        return this;
    }

    @Override
    protected XContentBuilder innerToXContent(XContentBuilder builder, ToXContent.Params params) throws IOException {
        builder.startObject("fuzzy");

        // Only serialize settings that differ from their defaults.
        if (fuzziness != Fuzziness.ONE) {
            fuzziness.toXContent(builder, params);
        }
        if (fuzzyTranspositions != XFuzzySuggester.DEFAULT_TRANSPOSITIONS) {
            builder.field("transpositions", fuzzyTranspositions);
        }
        if (fuzzyMinLength != XFuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH) {
            builder.field("min_length", fuzzyMinLength);
        }
        if (fuzzyPrefixLength != XFuzzySuggester.DEFAULT_NON_FUZZY_PREFIX) {
            builder.field("prefix_length", fuzzyPrefixLength);
        }
        if (unicodeAware != XFuzzySuggester.DEFAULT_UNICODE_AWARE) {
            builder.field("unicode_aware", unicodeAware);
        }

        builder.endObject();
        return builder;
    }
}
// ===== File: core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java =====
// (Apache License 2.0 header elided in this reconstruction.)
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.Util;

import java.io.IOException;
import java.util.Iterator;
import java.util.Set;

/**
 * TokenStream that fully consumes a delegate stream, expands it into its set of
 * finite strings (one per analysis path, capped at {@link #MAX_PATHS}), and
 * emits each finite string as a single byte-term token carrying the given
 * payload on every token.
 */
public final class CompletionTokenStream extends TokenStream {

    // fix: removed stray double semicolon after this initializer
    private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
    private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);

    private final TokenStream input;
    private final BytesRef payload;
    private final ToFiniteStrings toFiniteStrings;
    private Iterator<IntsRef> finiteStrings;
    // -1 until the input has been expanded; then the number of paths, then 0.
    private int posInc = -1;
    private static final int MAX_PATHS = 256;
    private CharTermAttribute charTermAttribute;

    public CompletionTokenStream(TokenStream input, BytesRef payload, ToFiniteStrings toFiniteStrings) throws IOException {
        // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
        // the input stream entirely in toFiniteStrings(input)
        this.input = input;
        this.payload = payload;
        this.toFiniteStrings = toFiniteStrings;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        if (finiteStrings == null) {
            // Lazily expand the input on the first call; fails fast on
            // combinatorial explosion of analysis paths.
            Set<IntsRef> strings = toFiniteStrings.toFiniteStrings(input);

            if (strings.size() > MAX_PATHS) {
                throw new IllegalArgumentException("TokenStream expanded to " + strings.size() + " finite strings. Only <= " + MAX_PATHS
                        + " finite strings are supported");
            }
            posInc = strings.size();
            finiteStrings = strings.iterator();
        }
        if (finiteStrings.hasNext()) {
            posAttr.setPositionIncrement(posInc);
            /*
             * this posInc encodes the number of paths that this surface form
             * produced. Multi Fields have the same surface form and therefore sum up
             */
            posInc = 0;
            Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
            if (charTermAttribute != null) {
                charTermAttribute.setLength(0);
                charTermAttribute.append(bytesAtt.toUTF16());
            }
            if (payload != null) {
                payloadAttr.setPayload(this.payload);
            }
            return true;
        }

        return false;
    }

    @Override
    public void end() throws IOException {
        super.end();
        // NOTE(review): the delegate is only end()ed when the stream was never
        // expanded (posInc == -1) — presumably toFiniteStrings() already ends
        // the input when it consumes it; confirm against the expansion impl.
        if (posInc == -1) {
            input.end();
        }
    }

    @Override
    public void close() throws IOException {
        // Same guard as end(): only close the delegate if it was never consumed.
        if (posInc == -1) {
            input.close();
        }
    }

    /** Strategy for expanding an analyzed token stream into its finite strings. */
    public interface ToFiniteStrings {
        Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        if (hasAttribute(CharTermAttribute.class)) {
            // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion
            charTermAttribute = getAttribute(CharTermAttribute.class);
        }
        finiteStrings = null;
        posInc = -1;
    }

    /** Term attribute whose bytes are written in place via {@link #builder()}. */
    public interface ByteTermAttribute extends TermToBytesRefAttribute {
        // marker interface

        /**
         * Return the builder from which the term is derived.
         */
        BytesRefBuilder builder();

        CharSequence toUTF16();
    }

    public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
        private final BytesRefBuilder bytes = new BytesRefBuilder();
        private CharsRefBuilder charsRef;

        @Override
        public void fillBytesRef() {
            // does nothing - we change in place
        }

        @Override
        public BytesRefBuilder builder() {
            return bytes;
        }

        @Override
        public BytesRef getBytesRef() {
            return bytes.get();
        }

        @Override
        public void clear() {
            bytes.clear();
        }

        @Override
        public void copyTo(AttributeImpl target) {
            ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
            other.bytes.copyBytes(bytes);
        }

        @Override
        public CharSequence toUTF16() {
            // Lazily allocate the UTF-16 scratch buffer; reused across calls.
            if (charsRef == null) {
                charsRef = new CharsRefBuilder();
            }
            charsRef.copyUTF8Bytes(getBytesRef());
            return charsRef.get();
        }
    }
}
// ===== File: .../PayloadProcessor.java — only its license header is visible in this view (truncated) =====
See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

import java.io.IOException;

/**
 * Encodes a suggestion's surface form, weight and user payload into a single
 * {@link BytesRef} and decodes it back. Implementations define the wire format;
 * {@link #parsePayload} is the inverse of {@link #buildPayload}.
 */
interface PayloadProcessor {

    /**
     * Combines the given surface form, weight and payload into one opaque payload.
     *
     * @param surfaceForm the suggestion text as stored
     * @param weight      the suggestion weight to embed
     * @param payload     the user-supplied payload bytes to embed
     * @return the combined, encoded payload
     * @throws IOException if encoding fails
     */
    BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException;

    /**
     * Decodes a payload previously produced by {@link #buildPayload} into {@code ref},
     * overwriting its fields (the holder is reused across calls by callers).
     *
     * @param payload the encoded payload to parse
     * @param ref     mutable holder receiving surface form, weight and payload
     * @throws IOException if the payload cannot be decoded
     */
    void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException;

    /** Reusable mutable holder for the three decoded components of a payload. */
    static class SuggestPayload {
        // Builders are reused so repeated parsePayload() calls avoid reallocation.
        final BytesRefBuilder payload = new BytesRefBuilder();
        long weight = 0;
        final BytesRefBuilder surfaceForm = new BytesRefBuilder();
    }
}