author    | Areek Zillur <areek.zillur@elasticsearch.com> | 2016-04-07 15:34:39 -0400
committer | Areek Zillur <areek.zillur@elasticsearch.com> | 2016-04-25 21:21:56 -0400
commit    | 4a1a03428d7884c185b05c364c6f220237a9567e (patch)
tree      | dce7bfb76bfa033c0961f60a7654426f2f288d26 /core/src/main/java/org
parent    | d39eb2d69131d7a6a8bda29dcac31342bb1a712e (diff)
Add bwc support for pre-5.0 completion index
This commit adds support for reading and querying completion fields that were indexed in 2.x.
Diffstat (limited to 'core/src/main/java/org')
19 files changed, 5087 insertions, 52 deletions
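A note on the weight handling that recurs throughout the new suggester code below: Lucene's top-N FST search returns minimal-cost paths, so XAnalyzingSuggester stores each suggestion weight as the cost `Integer.MAX_VALUE - weight`, which makes higher-weighted suggestions surface first. The sketch below is a standalone illustration of that round trip; `encodeWeight`/`decodeWeight` mirror the methods defined in XAnalyzingSuggester.java in this diff, while the `WeightCostSketch` wrapper class and the `main` demo are illustrative additions, not part of the commit.

```java
// Standalone sketch of XAnalyzingSuggester's weight <-> cost encoding.
// Higher weights map to lower costs, so a minimal-cost FST traversal
// yields suggestions in descending-weight order.
public final class WeightCostSketch {

    // weight -> cost (mirrors XAnalyzingSuggester#encodeWeight)
    static int encodeWeight(long value) {
        if (value < 0 || value > Integer.MAX_VALUE) {
            throw new UnsupportedOperationException("cannot encode value: " + value);
        }
        return Integer.MAX_VALUE - (int) value;
    }

    // cost -> weight (mirrors XAnalyzingSuggester#decodeWeight)
    static int decodeWeight(long encoded) {
        return (int) (Integer.MAX_VALUE - encoded);
    }

    public static void main(String[] args) {
        long[] weights = {0, 7, 42, Integer.MAX_VALUE};
        for (long weight : weights) {
            int cost = encodeWeight(weight);
            // The encoding round-trips exactly, and a larger weight
            // always produces a strictly smaller cost.
            System.out.printf("weight=%d -> cost=%d -> weight=%d%n",
                    weight, cost, decodeWeight(weight));
        }
    }
}
```

One consequence for tie-breaking, per the class javadoc below: two matching suggestions with equal weight (hence equal cost) are ordered by their analyzed form, and if that is also equal the order is undefined.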
diff --git a/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java b/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java new file mode 100644 index 0000000000..a9327d785e --- /dev/null +++ b/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java @@ -0,0 +1,1214 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.lucene.search.suggest.analyzing; + +import com.carrotsearch.hppc.ObjectIntHashMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.search.suggest.InputIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.OfflineSorter; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.FST.BytesReader; +import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.fst.Util.Result; +import org.apache.lucene.util.fst.Util.TopResults; +import org.elasticsearch.common.SuppressForbidden; +import org.elasticsearch.common.collect.HppcMaps; +import org.elasticsearch.common.io.PathUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import 
java.util.LinkedList; +import java.util.List; +import java.util.Set; + +/** + * Suggester that first analyzes the surface form, adds the + * analyzed form to a weighted FST, and then does the same + * thing at lookup time. This means lookup is based on the + * analyzed form while suggestions are still the surface + * form(s). + * + * <p> + * This can result in powerful suggester functionality. For + * example, if you use an analyzer removing stop words, + * then the partial text "ghost chr..." could see the + * suggestion "The Ghost of Christmas Past". Note that + * position increments MUST NOT be preserved for this example + * to work, so you should call the constructor with + * <code>preservePositionIncrements</code> parameter set to + * false + * + * <p> + * If SynonymFilter is used to map wifi and wireless network to + * hotspot then the partial text "wirele..." could suggest + * "wifi router". Token normalization like stemmers, accent + * removal, etc., would allow suggestions to ignore such + * variations. + * + * <p> + * When two matching suggestions have the same weight, they + * are tie-broken by the analyzed form. If their analyzed + * form is the same then the order is undefined. + * + * <p> + * There are some limitations: + * <ul> + * + * <li> A lookup from a query like "net" in English won't + * be any different than "net " (ie, user added a + * trailing space) because analyzers don't reflect + * when they've seen a token separator and when they + * haven't. + * + * <li> If you're using {@code StopFilter}, and the user will + * type "fast apple", but so far all they've typed is + * "fast a", again because the analyzer doesn't convey whether + * it's seen a token separator after the "a", + * {@code StopFilter} will remove that "a" causing + * far more matches than you'd expect. + * + * <li> Lookups with the empty string return no results + * instead of all results. + * </ul> + */ +public class XAnalyzingSuggester extends Lookup { + + /** + * FST<Weight,Surface>: + * input is the analyzed form, with a null byte between terms + * weights are encoded as costs: (Integer.MAX_VALUE-weight) + * surface is the original, unanalyzed form. + */ + private FST<Pair<Long,BytesRef>> fst = null; + + /** + * Analyzer that will be used for analyzing suggestions at + * index time. + */ + private final Analyzer indexAnalyzer; + + /** + * Analyzer that will be used for analyzing suggestions at + * query time. + */ + private final Analyzer queryAnalyzer; + + /** + * True if exact match suggestions should always be returned first. + */ + private final boolean exactFirst; + + /** + * True if separator between tokens should be preserved. + */ + private final boolean preserveSep; + + /** Include this flag in the options parameter to {@code + * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always + * return the exact match first, regardless of score. This + * has no performance impact but could result in + * low-quality suggestions. */ + public static final int EXACT_FIRST = 1; + + /** Include this flag in the options parameter to {@code + * #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve + * token separators when matching. */ + public static final int PRESERVE_SEP = 2; + + /** Represents the separation between tokens, if + * PRESERVE_SEP was specified */ + public static final int SEP_LABEL = '\u001F'; + + /** Marks end of the analyzed input and start of dedup + * byte. 
*/ + public static final int END_BYTE = 0x0; + + /** Maximum number of dup surface forms (different surface + * forms for the same analyzed form). */ + private final int maxSurfaceFormsPerAnalyzedForm; + + /** Maximum graph paths to index for a single analyzed + * surface form. This only matters if your analyzer + * makes lots of alternate paths (e.g. contains + * SynonymFilter). */ + private final int maxGraphExpansions; + + /** Highest number of analyzed paths we saw for any single + * input surface form. For analyzers that never create + * graphs this will always be 1. */ + private int maxAnalyzedPathsForOneInput; + + private boolean hasPayloads; + + private final int sepLabel; + private final int payloadSep; + private final int endByte; + private final int holeCharacter; + + public static final int PAYLOAD_SEP = '\u001F'; + public static final int HOLE_CHARACTER = '\u001E'; + + private final Automaton queryPrefix; + + /** Whether position holes should appear in the automaton. */ + private boolean preservePositionIncrements; + + /** Number of entries the lookup was built with */ + private long count = 0; + + /** + * Calls {@code #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int) + * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | + * PRESERVE_SEP, 256, -1)} + * + * @param analyzer Analyzer that will be used for analyzing suggestions while building the index. + */ + public XAnalyzingSuggester(Analyzer analyzer) { + this(analyzer, null, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, + SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); + } + + /** + * Calls {@code #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int) + * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | + * PRESERVE_SEP, 256, -1)} + * + * @param indexAnalyzer Analyzer that will be used for analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for analyzing query text during lookup + */ + public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { + this(indexAnalyzer, null, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, + SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); + } + + /** + * Creates a new suggester. + * + * @param indexAnalyzer Analyzer that will be used for + * analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for + * analyzing query text during lookup + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param maxSurfaceFormsPerAnalyzedForm Maximum number of + * surface forms to keep for a single analyzed form. + * When there are too many surface forms we discard the + * lowest weighted ones. + * @param maxGraphExpansions Maximum number of graph paths + * to expand from the analyzed form. Set this to -1 for + * no limit. 
+ */ + public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, + int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, + boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, + boolean hasPayloads, int maxAnalyzedPathsForOneInput, + int sepLabel, int payloadSep, int endByte, int holeCharacter) { + // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput + this.indexAnalyzer = indexAnalyzer; + this.queryAnalyzer = queryAnalyzer; + this.fst = fst; + this.hasPayloads = hasPayloads; + if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) { + throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options); + } + this.exactFirst = (options & EXACT_FIRST) != 0; + this.preserveSep = (options & PRESERVE_SEP) != 0; + + // FLORIAN EDIT: I added <code>queryPrefix</code> for context-dependent suggestions + this.queryPrefix = queryPrefix; + + // NOTE: this is just an implementation limitation; if + // somehow this is a problem we could fix it by using + // more than one byte to disambiguate ... but 256 seems + // like it should be way more than enough. + if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) { + throw new IllegalArgumentException( + "maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")"); + } + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + + if (maxGraphExpansions < 1 && maxGraphExpansions != -1) { + throw new IllegalArgumentException( + "maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")"); + } + this.maxGraphExpansions = maxGraphExpansions; + this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; + this.preservePositionIncrements = preservePositionIncrements; + this.sepLabel = sepLabel; + this.payloadSep = payloadSep; + this.endByte = endByte; + this.holeCharacter = holeCharacter; + } + + /** Returns byte size of the underlying FST. */ + @Override + public long ramBytesUsed() { + return fst == null ? 0 : fst.ramBytesUsed(); + } + + public int getMaxAnalyzedPathsForOneInput() { + return maxAnalyzedPathsForOneInput; + } + + // Replaces SEP with epsilon or remaps them if + // we were asked to preserve them: + private Automaton replaceSep(Automaton a) { + + Automaton result = new Automaton(); + + // Copy all states over + int numStates = a.getNumStates(); + for(int s=0;s<numStates;s++) { + result.createState(); + result.setAccept(s, a.isAccept(s)); + } + + // Go in reverse topo sort so we know we only have to + // make one pass: + Transition t = new Transition(); + int[] topoSortStates = topoSortStates(a); + for(int i=0;i<topoSortStates.length;i++) { + int state = topoSortStates[topoSortStates.length-1-i]; + int count = a.initTransition(state, t); + for(int j=0;j<count;j++) { + a.getNextTransition(t); + if (t.min == TokenStreamToAutomaton.POS_SEP) { + assert t.max == TokenStreamToAutomaton.POS_SEP; + if (preserveSep) { + // Remap to SEP_LABEL: + result.addTransition(state, t.dest, SEP_LABEL); + } else { + result.addEpsilon(state, t.dest); + } + } else if (t.min == TokenStreamToAutomaton.HOLE) { + assert t.max == TokenStreamToAutomaton.HOLE; + + // Just remove the hole: there will then be two + // SEP tokens next to each other, which will only + // match another hole at search time. Note that + // it will also match an empty-string token ... 
if + // that's somehow a problem we can always map HOLE + // to a dedicated byte (and escape it in the + // input). + result.addEpsilon(state, t.dest); + } else { + result.addTransition(state, t.dest, t.min, t.max); + } + } + } + + result.finishState(); + + return result; + } + + protected Automaton convertAutomaton(Automaton a) { + if (queryPrefix != null) { + a = Operations.concatenate(Arrays.asList(queryPrefix, a)); + // This automaton should not blow up during determinize: + a = Operations.determinize(a, Integer.MAX_VALUE); + } + return a; + } + + private int[] topoSortStates(Automaton a) { + int[] states = new int[a.getNumStates()]; + final Set<Integer> visited = new HashSet<>(); + final LinkedList<Integer> worklist = new LinkedList<>(); + worklist.add(0); + visited.add(0); + int upto = 0; + states[upto] = 0; + upto++; + Transition t = new Transition(); + while (worklist.size() > 0) { + int s = worklist.removeFirst(); + int count = a.initTransition(s, t); + for (int i=0;i<count;i++) { + a.getNextTransition(t); + if (!visited.contains(t.dest)) { + visited.add(t.dest); + worklist.add(t.dest); + states[upto++] = t.dest; + } + } + } + return states; + } + + /** Just escapes the 0xff byte (which we still for SEP). */ + private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton { + + final BytesRefBuilder spare = new BytesRefBuilder(); + private char sepLabel; + + public EscapingTokenStreamToAutomaton(char sepLabel) { + this.sepLabel = sepLabel; + } + + @Override + protected BytesRef changeToken(BytesRef in) { + int upto = 0; + for(int i=0;i<in.length;i++) { + byte b = in.bytes[in.offset+i]; + if (b == (byte) sepLabel) { + spare.grow(upto+2); + spare.setByteAt(upto++, (byte) sepLabel); + spare.setByteAt(upto++, b); + } else { + spare.grow(upto+1); + spare.setByteAt(upto++, b); + } + } + spare.setLength(upto); + return spare.get(); + } + } + + public TokenStreamToAutomaton getTokenStreamToAutomaton() { + final TokenStreamToAutomaton tsta; + if (preserveSep) { + tsta = new EscapingTokenStreamToAutomaton((char) sepLabel); + } else { + // When we're not preserving sep, we don't steal 0xff + // byte, so we don't need to do any escaping: + tsta = new TokenStreamToAutomaton(); + } + tsta.setPreservePositionIncrements(preservePositionIncrements); + return tsta; + } + + private static class AnalyzingComparator implements Comparator<BytesRef> { + + private final boolean hasPayloads; + + public AnalyzingComparator(boolean hasPayloads) { + this.hasPayloads = hasPayloads; + } + + private final ByteArrayDataInput readerA = new ByteArrayDataInput(); + private final ByteArrayDataInput readerB = new ByteArrayDataInput(); + private final BytesRef scratchA = new BytesRef(); + private final BytesRef scratchB = new BytesRef(); + + @Override + public int compare(BytesRef a, BytesRef b) { + + // First by analyzed form: + readerA.reset(a.bytes, a.offset, a.length); + scratchA.length = readerA.readShort(); + scratchA.bytes = a.bytes; + scratchA.offset = readerA.getPosition(); + + readerB.reset(b.bytes, b.offset, b.length); + scratchB.bytes = b.bytes; + scratchB.length = readerB.readShort(); + scratchB.offset = readerB.getPosition(); + + int cmp = scratchA.compareTo(scratchB); + if (cmp != 0) { + return cmp; + } + readerA.skipBytes(scratchA.length); + readerB.skipBytes(scratchB.length); + // Next by cost: + long aCost = readerA.readInt(); + long bCost = readerB.readInt(); + if (aCost < bCost) { + return -1; + } else if (aCost > bCost) { + return 1; + } + + // Finally by surface form: + 
if (hasPayloads) { + scratchA.length = readerA.readShort(); + scratchA.offset = readerA.getPosition(); + scratchB.length = readerB.readShort(); + scratchB.offset = readerB.getPosition(); + } else { + scratchA.offset = readerA.getPosition(); + scratchA.length = a.length - scratchA.offset; + scratchB.offset = readerB.getPosition(); + scratchB.length = b.length - scratchB.offset; + } + return scratchA.compareTo(scratchB); + } + } + + /** Non-null if this sugggester created a temp dir, needed only during build */ + private static FSDirectory tmpBuildDir; + + @SuppressForbidden(reason = "access temp directory for building index") + protected static synchronized FSDirectory getTempDir() { + if (tmpBuildDir == null) { + // Lazy init + String tempDirPath = System.getProperty("java.io.tmpdir"); + if (tempDirPath == null) { + throw new RuntimeException("Java has no temporary folder property (java.io.tmpdir)?"); + } + try { + tmpBuildDir = FSDirectory.open(PathUtils.get(tempDirPath)); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + return tmpBuildDir; + } + + @Override + public void build(InputIterator iterator) throws IOException { + String prefix = getClass().getSimpleName(); + Directory tempDir = getTempDir(); + OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads)); + + IndexOutput tempInput = tempDir.createTempOutput(prefix, "input", IOContext.DEFAULT); + + OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput); + OfflineSorter.ByteSequencesReader reader = null; + + hasPayloads = iterator.hasPayloads(); + + BytesRefBuilder scratch = new BytesRefBuilder(); + + TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + String tempSortedFileName = null; + + count = 0; + byte buffer[] = new byte[8]; + try { + ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); + + for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null;) { + LimitedFiniteStringsIterator finiteStrings = + new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions); + for (IntsRef string; (string = finiteStrings.next()) != null; count++) { + Util.toBytesRef(string, scratch); + + // length of the analyzed text (FST input) + if (scratch.length() > Short.MAX_VALUE-2) { + throw new IllegalArgumentException( + "cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length() + ")"); + } + short analyzedLength = (short) scratch.length(); + + // compute the required length: + // analyzed sequence + weight (4) + surface + analyzedLength (short) + int requiredLength = analyzedLength + 4 + surfaceForm.length + 2; + + BytesRef payload; + + if (hasPayloads) { + if (surfaceForm.length > (Short.MAX_VALUE-2)) { + throw new IllegalArgumentException( + "cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")"); + } + payload = iterator.payload(); + // payload + surfaceLength (short) + requiredLength += payload.length + 2; + } else { + payload = null; + } + + buffer = ArrayUtil.grow(buffer, requiredLength); + + output.reset(buffer); + + output.writeShort(analyzedLength); + + output.writeBytes(scratch.bytes(), 0, scratch.length()); + + output.writeInt(encodeWeight(iterator.weight())); + + if (hasPayloads) { + for(int i=0;i<surfaceForm.length;i++) { + if (surfaceForm.bytes[i] == payloadSep) { + throw new IllegalArgumentException( + "surface form cannot contain unit separator character U+001F; this character is reserved"); + } + } + 
output.writeShort((short) surfaceForm.length); + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + output.writeBytes(payload.bytes, payload.offset, payload.length); + } else { + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + } + + assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength; + + writer.write(buffer, 0, output.getPosition()); + } + maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size()); + } + writer.close(); + + // Sort all input/output pairs (required by FST.Builder): + tempSortedFileName = sorter.sort(tempInput.getName()); + + // Free disk space: + tempDir.deleteFile(tempInput.getName()); + + reader = new OfflineSorter.ByteSequencesReader( + tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), prefix); + + PairOutputs<Long,BytesRef> outputs = new PairOutputs<>( + PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + + // Build FST: + BytesRefBuilder previousAnalyzed = null; + BytesRefBuilder analyzed = new BytesRefBuilder(); + BytesRef surface = new BytesRef(); + IntsRefBuilder scratchInts = new IntsRefBuilder(); + ByteArrayDataInput input = new ByteArrayDataInput(); + + // Used to remove duplicate surface forms (but we + // still index the hightest-weight one). We clear + // this when we see a new analyzed form, so it cannot + // grow unbounded (at most 256 entries): + Set<BytesRef> seenSurfaceForms = new HashSet<>(); + + int dedup = 0; + while (reader.read(scratch)) { + input.reset(scratch.bytes(), 0, scratch.length()); + short analyzedLength = input.readShort(); + analyzed.grow(analyzedLength+2); + input.readBytes(analyzed.bytes(), 0, analyzedLength); + analyzed.setLength(analyzedLength); + + long cost = input.readInt(); + + surface.bytes = scratch.bytes(); + if (hasPayloads) { + surface.length = input.readShort(); + surface.offset = input.getPosition(); + } else { + surface.offset = input.getPosition(); + surface.length = scratch.length() - surface.offset; + } + + if (previousAnalyzed == null) { + previousAnalyzed = new BytesRefBuilder(); + previousAnalyzed.copyBytes(analyzed); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } else if (analyzed.get().equals(previousAnalyzed.get())) { + dedup++; + if (dedup >= maxSurfaceFormsPerAnalyzedForm) { + // More than maxSurfaceFormsPerAnalyzedForm + // dups: skip the rest: + continue; + } + if (seenSurfaceForms.contains(surface)) { + continue; + } + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } else { + dedup = 0; + previousAnalyzed.copyBytes(analyzed); + seenSurfaceForms.clear(); + seenSurfaceForms.add(BytesRef.deepCopyOf(surface)); + } + + // TODO: I think we can avoid the extra 2 bytes when + // there is no dup (dedup==0), but we'd have to fix + // the exactFirst logic ... which would be sort of + // hairy because we'd need to special case the two + // (dup/not dup)... 
+ + // NOTE: must be byte 0 so we sort before whatever + // is next + analyzed.append((byte) 0); + analyzed.append((byte) dedup); + + Util.toIntsRef(analyzed.get(), scratchInts); + //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString()); + if (!hasPayloads) { + builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface))); + } else { + int payloadOffset = input.getPosition() + surface.length; + int payloadLength = scratch.length() - payloadOffset; + BytesRef br = new BytesRef(surface.length + 1 + payloadLength); + System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); + br.bytes[surface.length] = (byte) payloadSep; + System.arraycopy(scratch.bytes(), payloadOffset, br.bytes, surface.length+1, payloadLength); + br.length = br.bytes.length; + builder.add(scratchInts.get(), outputs.newPair(cost, br)); + } + } + fst = builder.finish(); + + //PrintWriter pw = new PrintWriter("/tmp/out.dot"); + //Util.toDot(fst, pw, true, true); + //pw.close(); + + } finally { + IOUtils.closeWhileHandlingException(reader, writer); + IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName); + } + } + + @Override + public boolean store(OutputStream output) throws IOException { + DataOutput dataOut = new OutputStreamDataOutput(output); + try { + if (fst == null) { + return false; + } + + fst.save(dataOut); + dataOut.writeVInt(maxAnalyzedPathsForOneInput); + dataOut.writeByte((byte) (hasPayloads ? 1 : 0)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public long getCount() { + return count; + } + + @Override + public boolean load(InputStream input) throws IOException { + DataInput dataIn = new InputStreamDataInput(input); + try { + this.fst = new FST<>(dataIn, new PairOutputs<>( + PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + maxAnalyzedPathsForOneInput = dataIn.readVInt(); + hasPayloads = dataIn.readByte() == 1; + } finally { + IOUtils.close(input); + } + return true; + } + + private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) { + LookupResult result; + if (hasPayloads) { + int sepIndex = -1; + for(int i=0;i<output2.length;i++) { + if (output2.bytes[output2.offset+i] == payloadSep) { + sepIndex = i; + break; + } + } + assert sepIndex != -1; + final int payloadLen = output2.length - sepIndex - 1; + spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex); + BytesRef payload = new BytesRef(payloadLen); + System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen); + payload.length = payloadLen; + result = new LookupResult(spare.toString(), decodeWeight(output1), payload); + } else { + spare.copyUTF8Bytes(output2); + result = new LookupResult(spare.toString(), decodeWeight(output1)); + } + + return result; + } + + private boolean sameSurfaceForm(BytesRef key, BytesRef output2) { + if (hasPayloads) { + // output2 has at least PAYLOAD_SEP byte: + if (key.length >= output2.length) { + return false; + } + for(int i=0;i<key.length;i++) { + if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) { + return false; + } + } + return output2.bytes[output2.offset + key.length] == payloadSep; + } else { + return key.bytesEquals(output2); + } + } + + @Override + public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) { + assert num > 0; + + if (onlyMorePopular) { + throw new IllegalArgumentException("this suggester only works with 
onlyMorePopular=false"); + } + if (fst == null) { + return Collections.emptyList(); + } + + //System.out.println("lookup key=" + key + " num=" + num); + for (int i = 0; i < key.length(); i++) { + if (key.charAt(i) == holeCharacter) { + throw new IllegalArgumentException( + "lookup key cannot contain HOLE character U+001E; this character is reserved"); + } + if (key.charAt(i) == sepLabel) { + throw new IllegalArgumentException( + "lookup key cannot contain unit separator character U+001F; this character is reserved"); + } + } + final BytesRef utf8Key = new BytesRef(key); + try { + + Automaton lookupAutomaton = toLookupAutomaton(key); + + final CharsRefBuilder spare = new CharsRefBuilder(); + + //System.out.println(" now intersect exactFirst=" + exactFirst); + + // Intersect automaton w/ suggest wFST and get all + // prefix starting nodes & their outputs: + //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst); + + //System.out.println(" prefixPaths: " + prefixPaths.size()); + + BytesReader bytesReader = fst.getBytesReader(); + + FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<>(); + + final List<LookupResult> results = new ArrayList<>(); + + List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst); + + if (exactFirst) { + + int count = 0; + for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { + if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) { + // This node has END_BYTE arc leaving, meaning it's an + // "exact" match: + count++; + } + } + + // Searcher just to find the single exact only + // match, if present: + Util.TopNSearcher<Pair<Long,BytesRef>> searcher; + searcher = new Util.TopNSearcher<>( + fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator); + + // NOTE: we could almost get away with only using + // the first start node. The only catch is if + // maxSurfaceFormsPerAnalyzedForm had kicked in and + // pruned our exact match from one of these nodes + // ...: + for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { + if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) { + // This node has END_BYTE arc leaving, meaning it's an + // "exact" match: + searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input); + } + } + + Util.TopResults<Pair<Long,BytesRef>> completions = searcher.search(); + + // NOTE: this is rather inefficient: we enumerate + // every matching "exactly the same analyzed form" + // path, and then do linear scan to see if one of + // these exactly matches the input. It should be + // possible (though hairy) to do something similar + // to getByOutput, since the surface form is encoded + // into the FST output, so we more efficiently hone + // in on the exact surface-form match. 
Still, I + // suspect very little time is spent in this linear + // seach: it's bounded by how many prefix start + // nodes we have and the + // maxSurfaceFormsPerAnalyzedForm: + for(Result<Pair<Long,BytesRef>> completion : completions) { + BytesRef output2 = completion.output.output2; + if (sameSurfaceForm(utf8Key, output2)) { + results.add(getLookupResult(completion.output.output1, output2, spare)); + break; + } + } + + if (results.size() == num) { + // That was quick: + return results; + } + } + + Util.TopNSearcher<Pair<Long,BytesRef>> searcher; + searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, + num - results.size(), + num * maxAnalyzedPathsForOneInput, + weightComparator) { + private final Set<BytesRef> seen = new HashSet<>(); + + @Override + protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) { + + // Dedup: when the input analyzes to a graph we + // can get duplicate surface forms: + if (seen.contains(output.output2)) { + return false; + } + seen.add(output.output2); + + if (!exactFirst) { + return true; + } else { + // In exactFirst mode, don't accept any paths + // matching the surface form since that will + // create duplicate results: + if (sameSurfaceForm(utf8Key, output.output2)) { + // We found exact match, which means we should + // have already found it in the first search: + assert results.size() == 1; + return false; + } else { + return true; + } + } + } + }; + + prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); + + for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) { + searcher.addStartPaths(path.fstNode, path.output, true, path.input); + } + + TopResults<Pair<Long,BytesRef>> completions = searcher.search(); + + for(Result<Pair<Long,BytesRef>> completion : completions) { + + LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare); + + // TODO: for fuzzy case would be nice to return + // how many edits were required + + //System.out.println(" result=" + result); + results.add(result); + + if (results.size() == num) { + // In the exactFirst=true case the search may + // produce one extra path + break; + } + } + + return results; + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } + } + + @Override + public boolean store(DataOutput output) throws IOException { + output.writeVLong(count); + if (fst == null) { + return false; + } + + fst.save(output); + output.writeVInt(maxAnalyzedPathsForOneInput); + output.writeByte((byte) (hasPayloads ? 1 : 0)); + return true; + } + + @Override + public boolean load(DataInput input) throws IOException { + count = input.readVLong(); + this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + maxAnalyzedPathsForOneInput = input.readVInt(); + hasPayloads = input.readByte() == 1; + return true; + } + + /** Returns all completion paths to initialize the search. 
*/ + protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths, + Automaton lookupAutomaton, + FST<Pair<Long,BytesRef>> fst) + throws IOException { + return prefixPaths; + } + + final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { + try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) { + return toAutomaton(ts, ts2a); + } + } + + final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException { + // Create corresponding automaton: labels are bytes + // from each analyzed token, with byte 0 used as + // separator between tokens: + Automaton automaton = ts2a.toAutomaton(ts); + + automaton = replaceSep(automaton); + automaton = convertAutomaton(automaton); + + // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings + // assert SpecialOperations.isFinite(automaton); + + // Get all paths from the automaton (there can be + // more than one path, eg if the analyzer created a + // graph using SynFilter or WDF): + + return automaton; + } + + // EDIT: Adrien, needed by lookup providers + // NOTE: these XForks are unmaintainable, we need to get rid of them... + public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { + final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); + Automaton automaton; + try (TokenStream ts = stream) { + automaton = toAutomaton(ts, ts2a); + } + LimitedFiniteStringsIterator finiteStrings = + new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); + Set<IntsRef> set = new HashSet<>(); + for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) { + set.add(IntsRef.deepCopyOf(string)); + } + return Collections.unmodifiableSet(set); + } + + final Automaton toLookupAutomaton(final CharSequence key) throws IOException { + // TODO: is there a Reader from a CharSequence? + // Turn tokenstream into automaton: + Automaton automaton = null; + + try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) { + automaton = getTokenStreamToAutomaton().toAutomaton(ts); + } + + automaton = replaceSep(automaton); + + // TODO: we can optimize this somewhat by determinizing + // while we convert + + // This automaton should not blow up during determinize: + automaton = Operations.determinize(automaton, Integer.MAX_VALUE); + return automaton; + } + + + + /** + * Returns the weight associated with an input string, or null if it does not exist. + * + * Unsupported in this implementation (and will throw an {@link UnsupportedOperationException}). + * + * @param key input string + * @return the weight associated with the input string, or {@code null} if it does not exist. 
+ */ + public Object get(CharSequence key) { + throw new UnsupportedOperationException(); + } + + /** + * cost -> weight + * + * @param encoded Cost + * @return Weight + */ + public static int decodeWeight(long encoded) { + return (int)(Integer.MAX_VALUE - encoded); + } + + /** + * weight -> cost + * + * @param value Weight + * @return Cost + */ + public static int encodeWeight(long value) { + if (value < 0 || value > Integer.MAX_VALUE) { + throw new UnsupportedOperationException("cannot encode value: " + value); + } + return Integer.MAX_VALUE - (int)value; + } + + static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () { + @Override + public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) { + return left.output1.compareTo(right.output1); + } + }; + + + public static class XBuilder { + private Builder<Pair<Long, BytesRef>> builder; + private int maxSurfaceFormsPerAnalyzedForm; + private IntsRefBuilder scratchInts = new IntsRefBuilder(); + private final PairOutputs<Long, BytesRef> outputs; + private boolean hasPayloads; + private BytesRefBuilder analyzed = new BytesRefBuilder(); + private final SurfaceFormAndPayload[] surfaceFormsAndPayload; + private int count; + private ObjectIntHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f); + private int payloadSep; + + public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) { + this.payloadSep = payloadSep; + this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); + this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + this.hasPayloads = hasPayloads; + surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm]; + + } + public void startTerm(BytesRef analyzed) { + this.analyzed.grow(analyzed.length+2); + this.analyzed.copyBytes(analyzed); + } + + private final static class SurfaceFormAndPayload implements Comparable<SurfaceFormAndPayload> { + BytesRef payload; + long weight; + + public SurfaceFormAndPayload(BytesRef payload, long cost) { + super(); + this.payload = payload; + this.weight = cost; + } + + @Override + public int compareTo(SurfaceFormAndPayload o) { + int res = compare(weight, o.weight); + if (res == 0 ){ + return payload.compareTo(o.payload); + } + return res; + } + public static int compare(long x, long y) { + return (x < y) ? -1 : ((x == y) ? 0 : 1); + } + } + + public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException { + int surfaceIndex = -1; + long encodedWeight = cost == -1 ? cost : encodeWeight(cost); + /* + * we need to check if we have seen this surface form, if so only use the + * the surface form with the highest weight and drop the rest no matter if + * the payload differs. 
+ */ + if (count >= maxSurfaceFormsPerAnalyzedForm) { + // More than maxSurfaceFormsPerAnalyzedForm + // dups: skip the rest: + return; + } + + BytesRef surfaceCopy; + final int keySlot; + if (count > 0 && (keySlot = seenSurfaceForms.indexOf(surface)) >= 0) { + surfaceIndex = seenSurfaceForms.indexGet(keySlot); + SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex]; + if (encodedWeight >= surfaceFormAndPayload.weight) { + return; + } + surfaceCopy = BytesRef.deepCopyOf(surface); + } else { + surfaceIndex = count++; + surfaceCopy = BytesRef.deepCopyOf(surface); + seenSurfaceForms.put(surfaceCopy, surfaceIndex); + } + + BytesRef payloadRef; + if (!hasPayloads) { + payloadRef = surfaceCopy; + } else { + int len = surface.length + 1 + payload.length; + final BytesRef br = new BytesRef(len); + System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length); + br.bytes[surface.length] = (byte) payloadSep; + System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length); + br.length = len; + payloadRef = br; + } + if (surfaceFormsAndPayload[surfaceIndex] == null) { + surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight); + } else { + surfaceFormsAndPayload[surfaceIndex].payload = payloadRef; + surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight; + } + } + + public void finishTerm(long defaultWeight) throws IOException { + ArrayUtil.timSort(surfaceFormsAndPayload, 0, count); + int deduplicator = 0; + analyzed.append((byte) 0); + analyzed.setLength(analyzed.length() + 1); + analyzed.grow(analyzed.length()); + for (int i = 0; i < count; i++) { + analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++); + Util.toIntsRef(analyzed.get(), scratchInts); + SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i]; + long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight; + builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload)); + } + seenSurfaceForms.clear(); + count = 0; + } + + public FST<Pair<Long, BytesRef>> build() throws IOException { + return builder.finish(); + } + + public boolean hasPayloads() { + return hasPayloads; + } + + public int maxSurfaceFormsPerAnalyzedForm() { + return maxSurfaceFormsPerAnalyzedForm; + } + + } +} diff --git a/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java b/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java new file mode 100644 index 0000000000..31a04b6fa6 --- /dev/null +++ b/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XFuzzySuggester.java @@ -0,0 +1,267 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.lucene.search.suggest.analyzing; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStreamToAutomaton; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.FiniteStringsIterator; +import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.UTF32ToUTF8; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PairOutputs; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES; + +/** + * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is + * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though + * you can explicitly choose classic Levenshtein by passing <code>false</code> + * for the <code>transpositions</code> parameter. + * <p> + * At most, this query will match terms up to + * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} + * edits. Higher distances are not supported. Note that the + * fuzzy distance is measured in "byte space" on the bytes + * returned by the {@link org.apache.lucene.analysis.TokenStream}'s {@link + * org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute}, usually UTF8. By default + * the analyzed bytes must be at least 3 {@link + * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are + * considered. Furthermore, the first 1 {@link + * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be + * edited. We allow up to 1 {@link + * #DEFAULT_MAX_EDITS} edit. + * If the {@link #unicodeAware} parameter in the constructor is set to true, maxEdits, + * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code + * points (actual letters) instead of bytes. + * + * <p> + * NOTE: This suggester does not boost suggestions that + * required no edits over suggestions that did require + * edits. This is a known limitation. + * + * <p> + * Note: complex query analyzers can have a significant impact on the lookup + * performance. It's recommended to not use analyzers that drop or inject terms + * like synonyms to keep the complexity of the prefix intersection low for good + * lookup performance. At index time, complex analyzers can safely be used. + * </p> + */ +public final class XFuzzySuggester extends XAnalyzingSuggester { + private final int maxEdits; + private final boolean transpositions; + private final int nonFuzzyPrefix; + private final int minFuzzyLength; + private final boolean unicodeAware; + + /** + * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix + * parameters in Unicode code points (actual letters) + * instead of bytes. + */ + public static final boolean DEFAULT_UNICODE_AWARE = false; + + /** + * The default minimum length of the key passed to {@link + * #lookup} before any edits are allowed. + */ + public static final int DEFAULT_MIN_FUZZY_LENGTH = 3; + + /** + * The default prefix length where edits are not allowed. + */ + public static final int DEFAULT_NON_FUZZY_PREFIX = 1; + + /** + * The default maximum number of edits for fuzzy + * suggestions. 
+ */ + public static final int DEFAULT_MAX_EDITS = 1; + + /** + * The default transposition value passed to {@link org.apache.lucene.util.automaton.LevenshteinAutomata} + */ + public static final boolean DEFAULT_TRANSPOSITIONS = true; + + /** + * Creates a {@link FuzzySuggester} instance initialized with default values. + * + * @param analyzer the analyzer used for this suggester + */ + public XFuzzySuggester(Analyzer analyzer) { + this(analyzer, analyzer); + } + + /** + * Creates a {@link FuzzySuggester} instance with an index & a query analyzer initialized with default values. + * + * @param indexAnalyzer + * Analyzer that will be used for analyzing suggestions while building the index. + * @param queryAnalyzer + * Analyzer that will be used for analyzing query text during lookup + */ + public XFuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) { + this(indexAnalyzer, null, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, + DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, + DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, + null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER); + + } + + /** + * Creates a {@link FuzzySuggester} instance. + * + * @param indexAnalyzer Analyzer that will be used for + * analyzing suggestions while building the index. + * @param queryAnalyzer Analyzer that will be used for + * analyzing query text during lookup + * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP} + * @param maxSurfaceFormsPerAnalyzedForm Maximum number of + * surface forms to keep for a single analyzed form. + * When there are too many surface forms we discard the + * lowest weighted ones. + * @param maxGraphExpansions Maximum number of graph paths + * to expand from the analyzed form. Set this to -1 for + * no limit. + * @param maxEdits must be >= 0 and <= {@link org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} . + * @param transpositions <code>true</code> if transpositions should be treated as a primitive + * edit operation. If this is false, comparisons will implement the classic + * Levenshtein algorithm. 
+ * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}) + * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH}) + * @param sepLabel separation label + * @param payloadSep payload separator byte + * @param endByte end byte marker byte + */ + public XFuzzySuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, + int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, + int maxEdits, boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength, + boolean unicodeAware, FST<PairOutputs.Pair<Long, BytesRef>> fst, boolean hasPayloads, + int maxAnalyzedPathsForOneInput, int sepLabel, int payloadSep, int endByte, int holeCharacter) { + super(indexAnalyzer, queryPrefix, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, + true, fst, hasPayloads, maxAnalyzedPathsForOneInput, sepLabel, payloadSep, endByte, holeCharacter); + if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { + throw new IllegalArgumentException( + "maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + } + if (nonFuzzyPrefix < 0) { + throw new IllegalArgumentException("nonFuzzyPrefix must not be < 0 (got " + nonFuzzyPrefix + ")"); + } + if (minFuzzyLength < 0) { + throw new IllegalArgumentException("minFuzzyLength must not be < 0 (got " + minFuzzyLength + ")"); + } + + this.maxEdits = maxEdits; + this.transpositions = transpositions; + this.nonFuzzyPrefix = nonFuzzyPrefix; + this.minFuzzyLength = minFuzzyLength; + this.unicodeAware = unicodeAware; + } + + @Override + protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths( + List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, + FST<PairOutputs.Pair<Long,BytesRef>> fst) + throws IOException { + + // TODO: right now there's no penalty for fuzzy/edits, + // ie a completion whose prefix matched exactly what the + // user typed gets no boost over completions that + // required an edit, which get no boost over completions + // requiring two edits. I suspect a multiplicative + // factor is appropriate (eg, say a fuzzy match must be at + // least 2X better weight than the non-fuzzy match to + // "compete") ... in which case I think the wFST needs + // to be log weights or something ... 
+ + Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + w.write(levA.toDot()); + w.close(); + System.out.println("Wrote LevA to out.dot"); + */ + return FSTUtil.intersectPrefixPaths(levA, fst); + } + + @Override + protected Automaton convertAutomaton(Automaton a) { + if (unicodeAware) { + // FLORIAN EDIT: get converted Automaton from superclass + Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a)); + // This automaton should not blow up during determinize: + utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE); + return utf8automaton; + } else { + return super.convertAutomaton(a); + } + } + + @Override + public TokenStreamToAutomaton getTokenStreamToAutomaton() { + final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton(); + tsta.setUnicodeArcs(unicodeAware); + return tsta; + } + + Automaton toLevenshteinAutomata(Automaton automaton) { + List<Automaton> subs = new ArrayList<>(); + FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton); + for (IntsRef string; (string = finiteStrings.next()) != null;) { + if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) { + subs.add(Automata.makeString(string.ints, string.offset, string.length)); + } else { + int ints[] = new int[string.length-nonFuzzyPrefix]; + System.arraycopy(string.ints, string.offset+nonFuzzyPrefix, ints, 0, ints.length); + // TODO: maybe add alphaMin to LevenshteinAutomata, + // and pass 1 instead of 0? We probably don't want + // to allow the trailing dedup bytes to be + // edited... but then 0 byte is "in general" allowed + // on input (but not in UTF8). + LevenshteinAutomata lev = new LevenshteinAutomata( + ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions); + subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix))); + } + } + + if (subs.isEmpty()) { + // automaton is empty, there is no accepted paths through it + return Automata.makeEmpty(); // matches nothing + } else if (subs.size() == 1) { + // no synonyms or anything: just a single path through the tokenstream + return subs.get(0); + } else { + // multiple paths: this is really scary! is it slow? + // maybe we should not do this and throw UOE? + Automaton a = Operations.union(subs); + // TODO: we could call toLevenshteinAutomata() before det? + // this only happens if you have multiple paths anyway (e.g. 
synonyms) + return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES); + } + } +} diff --git a/core/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java b/core/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java index a4977baa1f..f0fb05f099 100644 --- a/core/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java +++ b/core/src/main/java/org/elasticsearch/index/codec/PerFieldMappingPostingFormatCodec.java @@ -28,6 +28,7 @@ import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.core.CompletionFieldMapper; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x; /** * {@link PerFieldMappingPostingFormatCodec This postings format} is the default @@ -54,11 +55,14 @@ public class PerFieldMappingPostingFormatCodec extends Lucene60Codec { @Override public PostingsFormat getPostingsFormatForField(String field) { - final MappedFieldType indexName = mapperService.fullName(field); - if (indexName == null) { + final MappedFieldType fieldType = mapperService.fullName(field); + if (fieldType == null) { logger.warn("no index mapper found for field: [{}] returning default postings format", field); - } else if (indexName instanceof CompletionFieldMapper.CompletionFieldType) { + } else if (fieldType instanceof CompletionFieldMapper.CompletionFieldType) { return CompletionFieldMapper.CompletionFieldType.postingsFormat(); + } else if (fieldType instanceof CompletionFieldMapper2x.CompletionFieldType) { + return ((CompletionFieldMapper2x.CompletionFieldType) fieldType).postingsFormat( + super.getPostingsFormatForField(field)); } return super.getPostingsFormatForField(field); } diff --git a/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java index c3c5a5cbcc..53162dc535 100644 --- a/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java +++ b/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java @@ -119,6 +119,9 @@ public class CompletionFieldMapper extends FieldMapper implements ArrayValueMapp @Override public Mapper.Builder<?, ?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException { + if (parserContext.indexVersionCreated().before(Version.V_5_0_0_alpha1)) { + return new CompletionFieldMapper2x.TypeParser().parse(name, node, parserContext); + } CompletionFieldMapper.Builder builder = new CompletionFieldMapper.Builder(name); NamedAnalyzer indexAnalyzer = null; NamedAnalyzer searchAnalyzer = null; diff --git a/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper2x.java b/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper2x.java new file mode 100644 index 0000000000..9b0306c687 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper2x.java @@ -0,0 +1,611 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.mapper.core; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.Version; +import org.elasticsearch.common.ParseField; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.XContentParser.NumberType; +import org.elasticsearch.common.xcontent.XContentParser.Token; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.Mapper; +import org.elasticsearch.index.mapper.MapperException; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider; +import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat; +import org.elasticsearch.search.suggest.completion2x.CompletionTokenStream; +import org.elasticsearch.search.suggest.completion2x.context.ContextBuilder; +import org.elasticsearch.search.suggest.completion2x.context.ContextMapping; +import org.elasticsearch.search.suggest.completion2x.context.ContextMapping.ContextConfig; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import static org.elasticsearch.index.mapper.core.TypeParsers.parseMultiField; + +/** + * + */ +public class CompletionFieldMapper2x extends FieldMapper { + + public static final String CONTENT_TYPE = "completion"; + + public static class Defaults { + public static final CompletionFieldType FIELD_TYPE = new CompletionFieldType(); + + static { + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.freeze(); + } + + public static final boolean DEFAULT_PRESERVE_SEPARATORS = true; + public static final boolean DEFAULT_POSITION_INCREMENTS = true; + public static final boolean DEFAULT_HAS_PAYLOADS = false; + public static final int DEFAULT_MAX_INPUT_LENGTH = 50; + } + + public static class Fields { + // Mapping field names + public static final String ANALYZER = "analyzer"; + public static final ParseField SEARCH_ANALYZER = new ParseField("search_analyzer"); + public static final ParseField PRESERVE_SEPARATORS = new ParseField("preserve_separators"); + public static final ParseField PRESERVE_POSITION_INCREMENTS = new ParseField("preserve_position_increments"); + public static final String PAYLOADS = 
"payloads"; + public static final String TYPE = "type"; + public static final ParseField MAX_INPUT_LENGTH = new ParseField("max_input_length", "max_input_len"); + // Content field names + public static final String CONTENT_FIELD_NAME_INPUT = "input"; + public static final String CONTENT_FIELD_NAME_OUTPUT = "output"; + public static final String CONTENT_FIELD_NAME_PAYLOAD = "payload"; + public static final String CONTENT_FIELD_NAME_WEIGHT = "weight"; + public static final String CONTEXT = "context"; + } + + public static final Set<String> ALLOWED_CONTENT_FIELD_NAMES; + static { + ALLOWED_CONTENT_FIELD_NAMES = new HashSet<>(); + ALLOWED_CONTENT_FIELD_NAMES.add(Fields.CONTENT_FIELD_NAME_INPUT); + ALLOWED_CONTENT_FIELD_NAMES.add(Fields.CONTENT_FIELD_NAME_OUTPUT); + ALLOWED_CONTENT_FIELD_NAMES.add(Fields.CONTENT_FIELD_NAME_PAYLOAD); + ALLOWED_CONTENT_FIELD_NAMES.add(Fields.CONTENT_FIELD_NAME_WEIGHT); + ALLOWED_CONTENT_FIELD_NAMES.add(Fields.CONTEXT); + } + + public static class Builder extends FieldMapper.Builder<Builder, CompletionFieldMapper2x> { + + private boolean preserveSeparators = Defaults.DEFAULT_PRESERVE_SEPARATORS; + private boolean payloads = Defaults.DEFAULT_HAS_PAYLOADS; + private boolean preservePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS; + private int maxInputLength = Defaults.DEFAULT_MAX_INPUT_LENGTH; + private SortedMap<String, ContextMapping> contextMapping = ContextMapping.EMPTY_MAPPING; + + public Builder(String name) { + super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); + builder = this; + } + + public Builder payloads(boolean payloads) { + this.payloads = payloads; + return this; + } + + public Builder preserveSeparators(boolean preserveSeparators) { + this.preserveSeparators = preserveSeparators; + return this; + } + + public Builder preservePositionIncrements(boolean preservePositionIncrements) { + this.preservePositionIncrements = preservePositionIncrements; + return this; + } + + public Builder maxInputLength(int maxInputLength) { + if (maxInputLength <= 0) { + throw new IllegalArgumentException( + Fields.MAX_INPUT_LENGTH.getPreferredName() + " must be > 0 but was [" + maxInputLength + "]"); + } + this.maxInputLength = maxInputLength; + return this; + } + + public Builder contextMapping(SortedMap<String, ContextMapping> contextMapping) { + this.contextMapping = contextMapping; + return this; + } + + @Override + public CompletionFieldMapper2x build(Mapper.BuilderContext context) { + setupFieldType(context); + CompletionFieldType completionFieldType = (CompletionFieldType) fieldType; + completionFieldType.setProvider( + new AnalyzingCompletionLookupProvider(preserveSeparators, preservePositionIncrements, payloads)); + completionFieldType.setContextMapping(contextMapping); + return new CompletionFieldMapper2x(name, fieldType, maxInputLength, context.indexSettings(), + multiFieldsBuilder.build(this, context), copyTo); + } + + } + + public static class TypeParser implements Mapper.TypeParser { + + @Override + public Mapper.Builder<?, ?> parse(String name, Map<String, Object> node, ParserContext parserContext) + throws MapperParsingException { + CompletionFieldMapper2x.Builder builder = new Builder(name); + NamedAnalyzer indexAnalyzer = null; + NamedAnalyzer searchAnalyzer = null; + for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext(); ) { + Map.Entry<String, Object> entry = iterator.next(); + String fieldName = entry.getKey(); + Object fieldNode = entry.getValue(); + if (fieldName.equals("type")) { + 
continue; + } + if (Fields.ANALYZER.equals(fieldName) || // index_analyzer is for backcompat, remove for v3.0 + fieldName.equals("index_analyzer") && parserContext.indexVersionCreated().before(Version.V_2_0_0_beta1)) { + + indexAnalyzer = getNamedAnalyzer(parserContext, fieldNode.toString()); + iterator.remove(); + } else if (parserContext.parseFieldMatcher().match(fieldName, Fields.SEARCH_ANALYZER)) { + searchAnalyzer = getNamedAnalyzer(parserContext, fieldNode.toString()); + iterator.remove(); + } else if (fieldName.equals(Fields.PAYLOADS)) { + builder.payloads(Boolean.parseBoolean(fieldNode.toString())); + iterator.remove(); + } else if (parserContext.parseFieldMatcher().match(fieldName, Fields.PRESERVE_SEPARATORS)) { + builder.preserveSeparators(Boolean.parseBoolean(fieldNode.toString())); + iterator.remove(); + } else if (parserContext.parseFieldMatcher().match(fieldName, Fields.PRESERVE_POSITION_INCREMENTS)) { + builder.preservePositionIncrements(Boolean.parseBoolean(fieldNode.toString())); + iterator.remove(); + } else if (parserContext.parseFieldMatcher().match(fieldName, Fields.MAX_INPUT_LENGTH)) { + builder.maxInputLength(Integer.parseInt(fieldNode.toString())); + iterator.remove(); + } else if (parseMultiField(builder, name, parserContext, fieldName, fieldNode)) { + iterator.remove(); + } else if (fieldName.equals(Fields.CONTEXT)) { + builder.contextMapping(ContextBuilder.loadMappings(fieldNode, parserContext.indexVersionCreated())); + iterator.remove(); + } + } + + if (indexAnalyzer == null) { + if (searchAnalyzer != null) { + throw new MapperParsingException( + "analyzer on completion field [" + name + "] must be set when search_analyzer is set"); + } + indexAnalyzer = searchAnalyzer = parserContext.analysisService().analyzer("simple"); + } else if (searchAnalyzer == null) { + searchAnalyzer = indexAnalyzer; + } + builder.indexAnalyzer(indexAnalyzer); + builder.searchAnalyzer(searchAnalyzer); + + return builder; + } + + private NamedAnalyzer getNamedAnalyzer(ParserContext parserContext, String name) { + NamedAnalyzer analyzer = parserContext.analysisService().analyzer(name); + if (analyzer == null) { + throw new IllegalArgumentException("Can't find default or mapped analyzer with name [" + name + "]"); + } + return analyzer; + } + } + + public static final class CompletionFieldType extends MappedFieldType { + private PostingsFormat postingsFormat; + private AnalyzingCompletionLookupProvider analyzingSuggestLookupProvider; + private SortedMap<String, ContextMapping> contextMapping = ContextMapping.EMPTY_MAPPING; + + public CompletionFieldType() { + } + + protected CompletionFieldType(CompletionFieldType ref) { + super(ref); + this.postingsFormat = ref.postingsFormat; + this.analyzingSuggestLookupProvider = ref.analyzingSuggestLookupProvider; + this.contextMapping = ref.contextMapping; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof CompletionFieldType)) return false; + if (!super.equals(o)) return false; + CompletionFieldType fieldType = (CompletionFieldType) o; + return analyzingSuggestLookupProvider.getPreserveSep() == fieldType.analyzingSuggestLookupProvider.getPreserveSep() + && analyzingSuggestLookupProvider.getPreservePositionsIncrements() == + fieldType.analyzingSuggestLookupProvider.getPreservePositionsIncrements() && + analyzingSuggestLookupProvider.hasPayloads() == fieldType.analyzingSuggestLookupProvider.hasPayloads() && + Objects.equals(getContextMapping(), fieldType.getContextMapping()); + } + + @Override + 
public int hashCode() { + return Objects.hash(super.hashCode(), + analyzingSuggestLookupProvider.getPreserveSep(), + analyzingSuggestLookupProvider.getPreservePositionsIncrements(), + analyzingSuggestLookupProvider.hasPayloads(), + getContextMapping()); + } + + @Override + public CompletionFieldType clone() { + return new CompletionFieldType(this); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public void checkCompatibility(MappedFieldType fieldType, List<String> conflicts, boolean strict) { + super.checkCompatibility(fieldType, conflicts, strict); + CompletionFieldType other = (CompletionFieldType) fieldType; + if (analyzingSuggestLookupProvider.hasPayloads() != other.analyzingSuggestLookupProvider.hasPayloads()) { + conflicts.add("mapper [" + name() + "] has different [payload] values"); + } + if (analyzingSuggestLookupProvider.getPreservePositionsIncrements() != + other.analyzingSuggestLookupProvider.getPreservePositionsIncrements()) { + conflicts.add("mapper [" + name() + "] has different [preserve_position_increments] values"); + } + if (analyzingSuggestLookupProvider.getPreserveSep() != other.analyzingSuggestLookupProvider.getPreserveSep()) { + conflicts.add("mapper [" + name() + "] has different [preserve_separators] values"); + } + if (!ContextMapping.mappingsAreEqual(getContextMapping(), other.getContextMapping())) { + conflicts.add("mapper [" + name() + "] has different [context_mapping] values"); + } + } + + public void setProvider(AnalyzingCompletionLookupProvider provider) { + checkIfFrozen(); + this.analyzingSuggestLookupProvider = provider; + } + + public synchronized PostingsFormat postingsFormat(PostingsFormat in) { + if (in instanceof Completion090PostingsFormat) { + throw new IllegalStateException("Double wrapping of " + Completion090PostingsFormat.class); + } + if (postingsFormat == null) { + postingsFormat = new Completion090PostingsFormat(in, analyzingSuggestLookupProvider); + } + return postingsFormat; + } + + public void setContextMapping(SortedMap<String, ContextMapping> contextMapping) { + checkIfFrozen(); + this.contextMapping = contextMapping; + } + + /** + * Get the context mapping associated with this completion field + */ + public SortedMap<String, ContextMapping> getContextMapping() { + return contextMapping; + } + + /** + * @return true if a context mapping has been defined + */ + public boolean requiresContext() { + return contextMapping.isEmpty() == false; + } + } + + private static final BytesRef EMPTY = new BytesRef(); + + private int maxInputLength; + + public CompletionFieldMapper2x(String simpleName, MappedFieldType fieldType, int maxInputLength, + Settings indexSettings, MultiFields multiFields, CopyTo copyTo) { + super(simpleName, fieldType, Defaults.FIELD_TYPE, indexSettings, multiFields, copyTo); + this.maxInputLength = maxInputLength; + } + + @Override + public CompletionFieldType fieldType() { + return (CompletionFieldType) super.fieldType(); + } + + /** + * Parses and indexes inputs + * Parsing: + * Acceptable format: + * "STRING" - interpreted as field value (input) + * "ARRAY" - each element can be one of "OBJECT" (see below) + * "OBJECT" - { "input": STRING|ARRAY, "weight": STRING|INT, "contexts": ARRAY|OBJECT } + */ + @Override + public Mapper parse(ParseContext context) throws IOException { + XContentParser parser = context.parser(); + XContentParser.Token token = parser.currentToken(); + if (token == XContentParser.Token.VALUE_NULL) { + throw new MapperParsingException("completion field [" + 
fieldType().name() + "] does not support null values"); + } + + String surfaceForm = null; + BytesRef payload = null; + long weight = -1; + List<String> inputs = new ArrayList<>(4); + + SortedMap<String, ContextConfig> contextConfig = null; + + if (token == XContentParser.Token.VALUE_STRING) { + inputs.add(parser.text()); + multiFields.parse(this, context); + } else { + String currentFieldName = null; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + if (!ALLOWED_CONTENT_FIELD_NAMES.contains(currentFieldName)) { + throw new IllegalArgumentException( + "Unknown field name[" + currentFieldName + "], must be one of " + ALLOWED_CONTENT_FIELD_NAMES); + } + } else if (Fields.CONTEXT.equals(currentFieldName)) { + SortedMap<String, ContextConfig> configs = new TreeMap<>(); + if (token == Token.START_OBJECT) { + while ((token = parser.nextToken()) != Token.END_OBJECT) { + String name = parser.currentName(); + ContextMapping mapping = fieldType().getContextMapping().get(name); + if (mapping == null) { + throw new ElasticsearchParseException("context [{}] is not defined", name); + } else { + token = parser.nextToken(); + configs.put(name, mapping.parseContext(context, parser)); + } + } + contextConfig = new TreeMap<>(); + for (ContextMapping mapping : fieldType().getContextMapping().values()) { + ContextConfig config = configs.get(mapping.name()); + contextConfig.put(mapping.name(), config == null ? mapping.defaultConfig() : config); + } + } else { + throw new ElasticsearchParseException("context must be an object"); + } + } else if (Fields.CONTENT_FIELD_NAME_PAYLOAD.equals(currentFieldName)) { + if (!isStoringPayloads()) { + throw new MapperException("Payloads disabled in mapping"); + } + if (token == XContentParser.Token.START_OBJECT) { + XContentBuilder payloadBuilder = + XContentFactory.contentBuilder(parser.contentType()).copyCurrentStructure(parser); + payload = payloadBuilder.bytes().toBytesRef(); + payloadBuilder.close(); + } else if (token.isValue()) { + payload = parser.utf8BytesOrNull(); + } else { + throw new MapperException("payload doesn't support type " + token); + } + } else if (token == XContentParser.Token.VALUE_STRING) { + if (Fields.CONTENT_FIELD_NAME_OUTPUT.equals(currentFieldName)) { + surfaceForm = parser.text(); + } + if (Fields.CONTENT_FIELD_NAME_INPUT.equals(currentFieldName)) { + inputs.add(parser.text()); + } + if (Fields.CONTENT_FIELD_NAME_WEIGHT.equals(currentFieldName)) { + Number weightValue; + try { + weightValue = Long.parseLong(parser.text()); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "Weight must be a string representing a numeric value, but was [" + parser.text() + "]"); + } + weight = weightValue.longValue(); // always parse a long to make sure we don't get overflow + checkWeight(weight); + } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if (Fields.CONTENT_FIELD_NAME_WEIGHT.equals(currentFieldName)) { + NumberType numberType = parser.numberType(); + if (NumberType.LONG != numberType && NumberType.INT != numberType) { + throw new IllegalArgumentException( + "Weight must be an integer, but was [" + parser.numberValue() + "]"); + } + weight = parser.longValue(); // always parse a long to make sure we don't get overflow + checkWeight(weight); + } + } else if (token == XContentParser.Token.START_ARRAY) { + if (Fields.CONTENT_FIELD_NAME_INPUT.equals(currentFieldName)) { + while ((token = 
parser.nextToken()) != XContentParser.Token.END_ARRAY) { + inputs.add(parser.text()); + } + } + } + } + } + + if (contextConfig == null) { + contextConfig = new TreeMap<>(); + for (ContextMapping mapping : fieldType().getContextMapping().values()) { + contextConfig.put(mapping.name(), mapping.defaultConfig()); + } + } + + final ContextMapping.Context ctx = new ContextMapping.Context(contextConfig, context.doc()); + + payload = payload == null ? EMPTY : payload; + if (surfaceForm == null) { // no surface form use the input + for (String input : inputs) { + if (input.length() == 0) { + continue; + } + BytesRef suggestPayload = fieldType().analyzingSuggestLookupProvider.buildPayload(new BytesRef( + input), weight, payload); + context.doc().add(getCompletionField(ctx, input, suggestPayload)); + } + } else { + BytesRef suggestPayload = fieldType().analyzingSuggestLookupProvider.buildPayload(new BytesRef( + surfaceForm), weight, payload); + for (String input : inputs) { + if (input.length() == 0) { + continue; + } + context.doc().add(getCompletionField(ctx, input, suggestPayload)); + } + } + return null; + } + + private void checkWeight(long weight) { + if (weight < 0 || weight > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Weight must be in the interval [0..2147483647], but was [" + weight + "]"); + } + } + + public Field getCompletionField(ContextMapping.Context ctx, String input, BytesRef payload) { + final String originalInput = input; + if (input.length() > maxInputLength) { + final int len = correctSubStringLen(input, Math.min(maxInputLength, input.length())); + input = input.substring(0, len); + } + for (int i = 0; i < input.length(); i++) { + if (isReservedChar(input.charAt(i))) { + throw new IllegalArgumentException("Illegal input [" + originalInput + "] UTF-16 codepoint [0x" + + Integer.toHexString(input.charAt(i)).toUpperCase(Locale.ROOT) + + "] at position " + i + " is a reserved character"); + } + } + return new SuggestField( + fieldType().name(), ctx, input, fieldType(), payload, fieldType().analyzingSuggestLookupProvider); + } + + public static int correctSubStringLen(String input, int len) { + if (Character.isHighSurrogate(input.charAt(len - 1))) { + assert input.length() >= len + 1 && Character.isLowSurrogate(input.charAt(len)); + return len + 1; + } + return len; + } + + public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException { + return fieldType().analyzingSuggestLookupProvider.buildPayload(surfaceForm, weight, payload); + } + + private static final class SuggestField extends Field { + private final BytesRef payload; + private final CompletionTokenStream.ToFiniteStrings toFiniteStrings; + private final ContextMapping.Context ctx; + + public SuggestField(String name, ContextMapping.Context ctx, + String value, MappedFieldType type, BytesRef payload, + CompletionTokenStream.ToFiniteStrings toFiniteStrings) { + super(name, value, type); + this.payload = payload; + this.toFiniteStrings = toFiniteStrings; + this.ctx = ctx; + } + + @Override + public TokenStream tokenStream(Analyzer analyzer, TokenStream previous) { + TokenStream ts = ctx.wrapTokenStream(super.tokenStream(analyzer, previous)); + return new CompletionTokenStream(ts, payload, toFiniteStrings); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(simpleName()) + .field(Fields.TYPE, CONTENT_TYPE); + + builder.field(Fields.ANALYZER, fieldType().indexAnalyzer().name()); + if 
(fieldType().indexAnalyzer().name().equals(fieldType().searchAnalyzer().name()) == false) { + builder.field(Fields.SEARCH_ANALYZER.getPreferredName(), fieldType().searchAnalyzer().name()); + } + builder.field(Fields.PAYLOADS, fieldType().analyzingSuggestLookupProvider.hasPayloads()); + builder.field(Fields.PRESERVE_SEPARATORS.getPreferredName(), + fieldType().analyzingSuggestLookupProvider.getPreserveSep()); + builder.field(Fields.PRESERVE_POSITION_INCREMENTS.getPreferredName(), + fieldType().analyzingSuggestLookupProvider.getPreservePositionsIncrements()); + builder.field(Fields.MAX_INPUT_LENGTH.getPreferredName(), this.maxInputLength); + multiFields.toXContent(builder, params); + + if (fieldType().requiresContext()) { + builder.startObject(Fields.CONTEXT); + for (ContextMapping mapping : fieldType().getContextMapping().values()) { + builder.value(mapping); + } + builder.endObject(); + } + + return builder.endObject(); + } + + @Override + protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException { + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + public boolean isStoringPayloads() { + return fieldType().analyzingSuggestLookupProvider.hasPayloads(); + } + + @Override + protected void doMerge(Mapper mergeWith, boolean updateAllTypes) { + super.doMerge(mergeWith, updateAllTypes); + CompletionFieldMapper2x fieldMergeWith = (CompletionFieldMapper2x) mergeWith; + this.maxInputLength = fieldMergeWith.maxInputLength; + } + + // this should be package private but our tests don't allow it. + public static boolean isReservedChar(char character) { + /* we use 0x001F as a SEP_LABEL in the suggester but we can use the UTF-16 representation since they + * are equivalent. We also don't need to convert the input character to UTF-8 here to check for + * the 0x00 end label since all multi-byte UTF-8 chars start with 0x10 binary so if the UTF-16 CP is == 0x00 + * it's the single byte UTF-8 CP */ + assert XAnalyzingSuggester.PAYLOAD_SEP == XAnalyzingSuggester.SEP_LABEL; // ensure they are the same! 
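+        // the cases below are the suggester's internal marker code points plus the 2.x context separator;
+        // any of them appearing in user input would corrupt the encoded suggestion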
+ switch (character) { + case XAnalyzingSuggester.END_BYTE: + case XAnalyzingSuggester.SEP_LABEL: + case XAnalyzingSuggester.HOLE_CHARACTER: + case ContextMapping.SEPARATOR: + return true; + default: + return false; + } + } +} diff --git a/core/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/core/src/main/java/org/elasticsearch/index/shard/IndexShard.java index 979dfd1815..8739c26570 100644 --- a/core/src/main/java/org/elasticsearch/index/shard/IndexShard.java +++ b/core/src/main/java/org/elasticsearch/index/shard/IndexShard.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.shard; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; @@ -105,6 +106,7 @@ import org.elasticsearch.indices.recovery.RecoveryFailedException; import org.elasticsearch.indices.recovery.RecoveryState; import org.elasticsearch.search.suggest.completion.CompletionFieldStats; import org.elasticsearch.search.suggest.completion.CompletionStats; +import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat; import org.elasticsearch.threadpool.ThreadPool; import java.io.IOException; @@ -678,6 +680,9 @@ public class IndexShard extends AbstractIndexShardComponent { CompletionStats completionStats = new CompletionStats(); try (final Engine.Searcher currentSearcher = acquireSearcher("completion_stats")) { completionStats.add(CompletionFieldStats.completionStats(currentSearcher.reader(), fields)); + Completion090PostingsFormat postingsFormat = ((Completion090PostingsFormat) + PostingsFormat.forName(Completion090PostingsFormat.CODEC_NAME)); + completionStats.add(postingsFormat.completionStats(currentSearcher.reader(), fields)); } return completionStats; } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/Suggest.java b/core/src/main/java/org/elasticsearch/search/suggest/Suggest.java index 35f70abf2c..f8fbdaf969 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/Suggest.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/Suggest.java @@ -112,6 +112,9 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex case CompletionSuggestion.TYPE: suggestion = new CompletionSuggestion(); break; + case org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.TYPE: + suggestion = new org.elasticsearch.search.suggest.completion2x.CompletionSuggestion(); + break; case PhraseSuggestion.TYPE: suggestion = new PhraseSuggestion(); break; @@ -173,6 +176,16 @@ public class Suggest implements Iterable<Suggest.Suggestion<? extends Entry<? ex List<Suggestion<? extends Entry<? extends Option>>> reduced = new ArrayList<>(groupedSuggestions.size()); for (java.util.Map.Entry<String, List<Suggestion>> unmergedResults : groupedSuggestions.entrySet()) { List<Suggestion> value = unmergedResults.getValue(); + Class<? 
extends Suggestion> suggestionClass = null; + for (Suggestion suggestion : value) { + if (suggestionClass == null) { + suggestionClass = suggestion.getClass(); + } else if (suggestionClass != suggestion.getClass()) { + throw new IllegalArgumentException( + "detected mixed suggestion results, due to querying on old and new completion suggester," + + " query on a single completion suggester version"); + } + } Suggestion reduce = value.get(0).reduce(value); reduce.trim(); reduced.add(reduce); diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java index 8bf35a34b2..16a2804b37 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggester.java @@ -18,8 +18,11 @@ */ package org.elasticsearch.search.suggest.completion; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.ReaderUtil; +import org.apache.lucene.index.Terms; import org.apache.lucene.search.BulkScorer; import org.apache.lucene.search.CollectionTerminatedException; import org.apache.lucene.search.IndexSearcher; @@ -29,7 +32,9 @@ import org.apache.lucene.search.suggest.document.CompletionQuery; import org.apache.lucene.search.suggest.document.TopSuggestDocs; import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector; import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.CollectionUtil; import org.apache.lucene.util.PriorityQueue; +import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.text.Text; import org.elasticsearch.index.fielddata.AtomicFieldData; @@ -42,10 +47,12 @@ import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.search.suggest.Suggest; import org.elasticsearch.search.suggest.Suggester; import org.elasticsearch.search.suggest.SuggestionBuilder; +import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; @@ -61,56 +68,119 @@ public class CompletionSuggester extends Suggester<CompletionSuggestionContext> @Override protected Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? 
extends Suggest.Suggestion.Entry.Option>> innerExecute(String name, final CompletionSuggestionContext suggestionContext, final IndexSearcher searcher, CharsRefBuilder spare) throws IOException { - final CompletionFieldMapper.CompletionFieldType fieldType = suggestionContext.getFieldType(); - if (fieldType == null) { - throw new IllegalArgumentException("field [" + suggestionContext.getField() + "] is not a completion field"); - } - CompletionSuggestion completionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize()); - spare.copyUTF8Bytes(suggestionContext.getText()); - CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry(new Text(spare.toString()), 0, spare.length()); - completionSuggestion.addTerm(completionSuggestEntry); - TopSuggestDocsCollector collector = new TopDocumentsCollector(suggestionContext.getSize()); - suggest(searcher, suggestionContext.toQuery(), collector); - int numResult = 0; - List<LeafReaderContext> leaves = searcher.getIndexReader().leaves(); - for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) { - TopDocumentsCollector.SuggestDoc suggestDoc = (TopDocumentsCollector.SuggestDoc) suggestScoreDoc; - // collect contexts - Map<String, Set<CharSequence>> contexts = Collections.emptyMap(); - if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) { - contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts()); - } - // collect payloads - final Map<String, List<Object>> payload = new HashMap<>(0); - List<String> payloadFields = suggestionContext.getPayloadFields(); - if (payloadFields.isEmpty() == false) { - final int readerIndex = ReaderUtil.subIndex(suggestDoc.doc, leaves); - final LeafReaderContext subReaderContext = leaves.get(readerIndex); - final int subDocId = suggestDoc.doc - subReaderContext.docBase; - for (String field : payloadFields) { - MapperService mapperService = suggestionContext.getShardContext().getMapperService(); - MappedFieldType payloadFieldType = mapperService.fullName(field); - if (payloadFieldType != null) { - QueryShardContext shardContext = suggestionContext.getShardContext(); - final AtomicFieldData data = shardContext.getForField(payloadFieldType) + if (suggestionContext.getFieldType() != null) { + final CompletionFieldMapper.CompletionFieldType fieldType = suggestionContext.getFieldType(); + CompletionSuggestion completionSuggestion = new CompletionSuggestion(name, suggestionContext.getSize()); + spare.copyUTF8Bytes(suggestionContext.getText()); + CompletionSuggestion.Entry completionSuggestEntry = new CompletionSuggestion.Entry( + new Text(spare.toString()), 0, spare.length()); + completionSuggestion.addTerm(completionSuggestEntry); + TopSuggestDocsCollector collector = new TopDocumentsCollector(suggestionContext.getSize()); + suggest(searcher, suggestionContext.toQuery(), collector); + int numResult = 0; + List<LeafReaderContext> leaves = searcher.getIndexReader().leaves(); + for (TopSuggestDocs.SuggestScoreDoc suggestScoreDoc : collector.get().scoreLookupDocs()) { + TopDocumentsCollector.SuggestDoc suggestDoc = (TopDocumentsCollector.SuggestDoc) suggestScoreDoc; + // collect contexts + Map<String, Set<CharSequence>> contexts = Collections.emptyMap(); + if (fieldType.hasContextMappings() && suggestDoc.getContexts().isEmpty() == false) { + contexts = fieldType.getContextMappings().getNamedContexts(suggestDoc.getContexts()); + } + // collect payloads + final Map<String, List<Object>> payload = new 
HashMap<>(0); + List<String> payloadFields = suggestionContext.getPayloadFields(); + if (payloadFields.isEmpty() == false) { + final int readerIndex = ReaderUtil.subIndex(suggestDoc.doc, leaves); + final LeafReaderContext subReaderContext = leaves.get(readerIndex); + final int subDocId = suggestDoc.doc - subReaderContext.docBase; + for (String field : payloadFields) { + MapperService mapperService = suggestionContext.getShardContext().getMapperService(); + MappedFieldType payloadFieldType = mapperService.fullName(field); + if (payloadFieldType != null) { + QueryShardContext shardContext = suggestionContext.getShardContext(); + final AtomicFieldData data = shardContext.getForField(payloadFieldType) .load(subReaderContext); - final ScriptDocValues scriptValues = data.getScriptValues(); - scriptValues.setNextDocId(subDocId); - payload.put(field, new ArrayList<>(scriptValues.getValues())); - } else { - throw new IllegalArgumentException("payload field [" + field + "] does not exist"); + final ScriptDocValues scriptValues = data.getScriptValues(); + scriptValues.setNextDocId(subDocId); + payload.put(field, new ArrayList<>(scriptValues.getValues())); + } else { + throw new IllegalArgumentException("payload field [" + field + "] does not exist"); + } } } - } - if (numResult++ < suggestionContext.getSize()) { - CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option( + if (numResult++ < suggestionContext.getSize()) { + CompletionSuggestion.Entry.Option option = new CompletionSuggestion.Entry.Option( new Text(suggestDoc.key.toString()), suggestDoc.score, contexts, payload); - completionSuggestEntry.addOption(option); - } else { - break; + completionSuggestEntry.addOption(option); + } else { + break; + } } + return completionSuggestion; + } else if (suggestionContext.getFieldType2x() != null) { + final IndexReader indexReader = searcher.getIndexReader(); + org.elasticsearch.search.suggest.completion2x.CompletionSuggestion completionSuggestion = + new org.elasticsearch.search.suggest.completion2x.CompletionSuggestion(name, suggestionContext.getSize()); + spare.copyUTF8Bytes(suggestionContext.getText()); + + org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry completionSuggestEntry = + new org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry(new Text(spare.toString()), 0, spare.length()); + completionSuggestion.addTerm(completionSuggestEntry); + + String fieldName = suggestionContext.getField(); + Map<String, org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option> results = + new HashMap<>(indexReader.leaves().size() * suggestionContext.getSize()); + for (LeafReaderContext atomicReaderContext : indexReader.leaves()) { + LeafReader atomicReader = atomicReaderContext.reader(); + Terms terms = atomicReader.fields().terms(fieldName); + if (terms instanceof Completion090PostingsFormat.CompletionTerms) { + final Completion090PostingsFormat.CompletionTerms lookupTerms = (Completion090PostingsFormat.CompletionTerms) terms; + final Lookup lookup = lookupTerms.getLookup(suggestionContext.getFieldType2x(), suggestionContext); + if (lookup == null) { + // we don't have a lookup for this segment.. this might be possible if a merge dropped all + // docs from the segment that had a value in this segment. 
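+                        // skip this segment; other segments can still contribute suggestions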
+ continue; + } + List<Lookup.LookupResult> lookupResults = lookup.lookup(spare.get(), false, suggestionContext.getSize()); + for (Lookup.LookupResult res : lookupResults) { + + final String key = res.key.toString(); + final float score = res.value; + final org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option value = results.get(key); + if (value == null) { + final org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option option = + new org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option(new Text(key), score, + res.payload == null ? null : new BytesArray(res.payload)); + results.put(key, option); + } else if (value.getScore() < score) { + value.setScore(score); + value.setPayload(res.payload == null ? null : new BytesArray(res.payload)); + } + } + } + } + final List<org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option> options = + new ArrayList<>(results.values()); + CollectionUtil.introSort(options, scoreComparator); + + int optionCount = Math.min(suggestionContext.getSize(), options.size()); + for (int i = 0; i < optionCount; i++) { + completionSuggestEntry.addOption(options.get(i)); + } + + return completionSuggestion; + } + return null; + } + + private static final ScoreComparator scoreComparator = new ScoreComparator(); + public static class ScoreComparator implements + Comparator<org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option> { + @Override + public int compare(org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option o1, + org.elasticsearch.search.suggest.completion2x.CompletionSuggestion.Entry.Option o2) { + return Float.compare(o2.getScore(), o1.getScore()); } - return completionSuggestion; } private static void suggest(IndexSearcher searcher, CompletionQuery query, TopSuggestDocsCollector collector) throws IOException { diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java index 2d9307a882..7810d03004 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionBuilder.java @@ -34,6 +34,7 @@ import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.core.CompletionFieldMapper; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x; import org.elasticsearch.index.query.QueryParseContext; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.search.suggest.SuggestUtils; @@ -41,6 +42,9 @@ import org.elasticsearch.search.suggest.SuggestionBuilder; import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext; import org.elasticsearch.search.suggest.completion.context.ContextMapping; import org.elasticsearch.search.suggest.completion.context.ContextMappings; +import org.elasticsearch.search.suggest.completion2x.context.CategoryContextMapping; +import org.elasticsearch.search.suggest.completion2x.context.ContextMapping.ContextQuery; +import org.elasticsearch.search.suggest.completion2x.context.GeolocationContextMapping; import java.io.IOException; import java.util.ArrayList; @@ -219,13 +223,116 @@ public class CompletionSuggestionBuilder extends 
SuggestionBuilder<CompletionSug contentBuilder.endArray(); } contentBuilder.endObject(); - contextBytes = contentBuilder.bytes(); - return this; + return contexts(contentBuilder); } catch (IOException e) { throw new IllegalArgumentException(e); } } + private CompletionSuggestionBuilder contexts(XContentBuilder contextBuilder) { + contextBytes = contextBuilder.bytes(); + return this; + } + + public CompletionSuggestionBuilder contexts(Contexts2x contexts2x) { + Objects.requireNonNull(contexts2x, "contexts must not be null"); + try { + XContentBuilder contentBuilder = XContentFactory.jsonBuilder(); + contentBuilder.startObject(); + for (ContextQuery contextQuery : contexts2x.contextQueries) { + contextQuery.toXContent(contentBuilder, EMPTY_PARAMS); + } + contentBuilder.endObject(); + return contexts(contentBuilder); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + // for 2.x context support + public static class Contexts2x { + private List<ContextQuery> contextQueries = new ArrayList<>(); + + @SuppressWarnings("unchecked") + private Contexts2x addContextQuery(ContextQuery ctx) { + this.contextQueries.add(ctx); + return this; + } + + /** + * Setup a Geolocation for suggestions. See {@link GeolocationContextMapping}. + * @param lat Latitude of the location + * @param lon Longitude of the Location + * @return this + */ + @Deprecated + public Contexts2x addGeoLocation(String name, double lat, double lon, int ... precisions) { + return addContextQuery(GeolocationContextMapping.query(name, lat, lon, precisions)); + } + + /** + * Setup a Geolocation for suggestions. See {@link GeolocationContextMapping}. + * @param lat Latitude of the location + * @param lon Longitude of the Location + * @param precisions precisions as string var-args + * @return this + */ + @Deprecated + public Contexts2x addGeoLocationWithPrecision(String name, double lat, double lon, String ... precisions) { + return addContextQuery(GeolocationContextMapping.query(name, lat, lon, precisions)); + } + + /** + * Setup a Geolocation for suggestions. See {@link GeolocationContextMapping}. + * @param geohash Geohash of the location + * @return this + */ + @Deprecated + public Contexts2x addGeoLocation(String name, String geohash) { + return addContextQuery(GeolocationContextMapping.query(name, geohash)); + } + + /** + * Setup a Category for suggestions. See {@link CategoryContextMapping}. + * @param categories name of the category + * @return this + */ + @Deprecated + public Contexts2x addCategory(String name, CharSequence...categories) { + return addContextQuery(CategoryContextMapping.query(name, categories)); + } + + /** + * Setup a Category for suggestions. See {@link CategoryContextMapping}. + * @param categories name of the category + * @return this + */ + @Deprecated + public Contexts2x addCategory(String name, Iterable<? extends CharSequence> categories) { + return addContextQuery(CategoryContextMapping.query(name, categories)); + } + + /** + * Setup a Context Field for suggestions. See {@link CategoryContextMapping}. + * @param fieldvalues name of the category + * @return this + */ + @Deprecated + public Contexts2x addContextField(String name, CharSequence...fieldvalues) { + return addContextQuery(CategoryContextMapping.query(name, fieldvalues)); + } + + /** + * Setup a Context Field for suggestions. See {@link CategoryContextMapping}. + * @param fieldvalues name of the category + * @return this + */ + @Deprecated + public Contexts2x addContextField(String name, Iterable<? 
extends CharSequence> fieldvalues) { + return addContextQuery(CategoryContextMapping.query(name, fieldvalues)); + } + } + private static class InnerBuilder extends CompletionSuggestionBuilder { private String field; @@ -285,7 +392,12 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug suggestionContext.setFuzzyOptions(fuzzyOptions); suggestionContext.setRegexOptions(regexOptions); MappedFieldType mappedFieldType = mapperService.fullName(suggestionContext.getField()); - if (mappedFieldType != null && mappedFieldType instanceof CompletionFieldMapper.CompletionFieldType) { + if (mappedFieldType == null || + (mappedFieldType instanceof CompletionFieldMapper.CompletionFieldType == false + && mappedFieldType instanceof CompletionFieldMapper2x.CompletionFieldType == false)) { + throw new IllegalArgumentException("Field [" + suggestionContext.getField() + "] is not a completion suggest field"); + } + if (mappedFieldType instanceof CompletionFieldMapper.CompletionFieldType) { CompletionFieldMapper.CompletionFieldType type = (CompletionFieldMapper.CompletionFieldType) mappedFieldType; suggestionContext.setFieldType(type); if (type.hasContextMappings() && contextBytes != null) { @@ -310,9 +422,23 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug } else if (contextBytes != null) { throw new IllegalArgumentException("suggester [" + type.name() + "] doesn't expect any context"); } - } else { - throw new IllegalArgumentException("Field [" + suggestionContext.getField() + "] is not a completion suggest field"); + } else if (mappedFieldType instanceof CompletionFieldMapper2x.CompletionFieldType) { + CompletionFieldMapper2x.CompletionFieldType type = ((CompletionFieldMapper2x.CompletionFieldType) mappedFieldType); + suggestionContext.setFieldType2x(type); + if (type.requiresContext()) { + if (contextBytes != null) { + try (XContentParser contextParser = XContentFactory.xContent(contextBytes).createParser(contextBytes)) { + contextParser.nextToken(); + suggestionContext.setContextQueries(ContextQuery.parseQueries(type.getContextMapping(), contextParser)); + } + } else { + throw new IllegalArgumentException("suggester [completion] requires context to be setup"); + } + } else if (contextBytes != null) { + throw new IllegalArgumentException("suggester [completion] doesn't expect any context"); + } } + assert suggestionContext.getFieldType() != null || suggestionContext.getFieldType2x() != null : "no completion field type set"; return suggestionContext; } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java index 1941bc9fb8..268e0553ff 100644 --- a/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion/CompletionSuggestionContext.java @@ -21,10 +21,12 @@ package org.elasticsearch.search.suggest.completion; import org.apache.lucene.search.suggest.document.CompletionQuery; import org.elasticsearch.common.unit.Fuzziness; import org.elasticsearch.index.mapper.core.CompletionFieldMapper; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.search.suggest.SuggestionSearchContext; import org.elasticsearch.search.suggest.completion.context.ContextMapping; import 
org.elasticsearch.search.suggest.completion.context.ContextMappings; +import org.elasticsearch.search.suggest.completion2x.context.ContextMapping.ContextQuery; import java.util.Collections; import java.util.List; @@ -44,11 +46,17 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest private RegexOptions regexOptions; private Map<String, List<ContextMapping.InternalQueryContext>> queryContexts = Collections.emptyMap(); private List<String> payloadFields = Collections.emptyList(); + private CompletionFieldMapper2x.CompletionFieldType fieldType2x; + private List<ContextQuery> contextQueries; CompletionFieldMapper.CompletionFieldType getFieldType() { return this.fieldType; } + CompletionFieldMapper2x.CompletionFieldType getFieldType2x() { + return this.fieldType2x; + } + void setFieldType(CompletionFieldMapper.CompletionFieldType fieldType) { this.fieldType = fieldType; } @@ -116,4 +124,16 @@ public class CompletionSuggestionContext extends SuggestionSearchContext.Suggest } return query; } + + public void setFieldType2x(CompletionFieldMapper2x.CompletionFieldType type) { + this.fieldType2x = type; + } + + public void setContextQueries(List<ContextQuery> contextQueries) { + this.contextQueries = contextQueries; + } + + public List<ContextQuery> getContextQueries() { + return contextQueries; + } } diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/AnalyzingCompletionLookupProvider.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/AnalyzingCompletionLookupProvider.java new file mode 100644 index 0000000000..48f0afc73c --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/AnalyzingCompletionLookupProvider.java @@ -0,0 +1,413 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.suggest.completion2x; + +import com.carrotsearch.hppc.ObjectLongHashMap; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester; +import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.fst.ByteSequenceOutputs; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PairOutputs; +import org.apache.lucene.util.fst.PairOutputs.Pair; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.elasticsearch.common.regex.Regex; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x; +import org.elasticsearch.search.suggest.completion.CompletionStats; +import org.elasticsearch.search.suggest.completion.CompletionSuggestionContext; +import org.elasticsearch.search.suggest.completion.FuzzyOptions; +import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.CompletionLookupProvider; +import org.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.LookupFactory; +import org.elasticsearch.search.suggest.completion2x.context.ContextMapping.ContextQuery; + +import java.io.IOException; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider { + + // for serialization + public static final int SERIALIZE_PRESERVE_SEPARATORS = 1; + public static final int SERIALIZE_HAS_PAYLOADS = 2; + public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4; + + private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256; + private static final int MAX_GRAPH_EXPANSIONS = -1; + + public static final String CODEC_NAME = "analyzing"; + public static final int CODEC_VERSION_START = 1; + public static final int CODEC_VERSION_SERIALIZED_LABELS = 2; + public static final int CODEC_VERSION_CHECKSUMS = 3; + public static final int CODEC_VERSION_LATEST = CODEC_VERSION_CHECKSUMS; + + private final boolean preserveSep; + private final boolean preservePositionIncrements; + private final int maxSurfaceFormsPerAnalyzedForm; + private final int maxGraphExpansions; + private final boolean hasPayloads; + private final XAnalyzingSuggester prototype; + + public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean preservePositionIncrements, boolean hasPayloads) { + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.hasPayloads = hasPayloads; + this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM; + this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS; + int options = preserveSep ? 
XAnalyzingSuggester.PRESERVE_SEP : 0;
+        // needs to be fixed in the suggester first before it can be supported
+        //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
+        prototype = new XAnalyzingSuggester(null, null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
+            preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP,
+            XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
+    }
+
+    @Override
+    public String getName() {
+        return "analyzing";
+    }
+
+    public boolean getPreserveSep() {
+        return preserveSep;
+    }
+
+    public boolean getPreservePositionsIncrements() {
+        return preservePositionIncrements;
+    }
+
+    public boolean hasPayloads() {
+        return hasPayloads;
+    }
+
+    @Override
+    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
+        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
+        return new FieldsConsumer() {
+            private Map<String, Long> fieldOffsets = new HashMap<>();
+
+            @Override
+            public void close() throws IOException {
+                try {
+                    /*
+                     * write the offsets per field such that we know where
+                     * we need to load the FSTs from
+                     */
+                    long pointer = output.getFilePointer();
+                    output.writeVInt(fieldOffsets.size());
+                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
+                        output.writeString(entry.getKey());
+                        output.writeVLong(entry.getValue());
+                    }
+                    output.writeLong(pointer);
+                    CodecUtil.writeFooter(output);
+                } finally {
+                    IOUtils.close(output);
+                }
+            }
+
+            @Override
+            public void write(Fields fields) throws IOException {
+                for (String field : fields) {
+                    Terms terms = fields.terms(field);
+                    if (terms == null) {
+                        continue;
+                    }
+                    TermsEnum termsEnum = terms.iterator();
+                    PostingsEnum docsEnum = null;
+                    final SuggestPayload spare = new SuggestPayload();
+                    int maxAnalyzedPathsForOneInput = 0;
+                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
+                        maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
+                    int docCount = 0;
+                    while (true) {
+                        BytesRef term = termsEnum.next();
+                        if (term == null) {
+                            break;
+                        }
+                        docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
+                        builder.startTerm(term);
+                        int docFreq = 0;
+                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                            for (int i = 0; i < docsEnum.freq(); i++) {
+                                final int position = docsEnum.nextPosition();
+                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
+                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
+                                // multi fields have the same surface form so we sum up here
+                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
+                            }
+                            docFreq++;
+                            docCount = Math.max(docCount, docsEnum.docID()+1);
+                        }
+                        builder.finishTerm(docFreq);
+                    }
+                    /*
+                     * Here we are done processing the field and we can
+                     * build the FST and write it to disk.
+                     */
+                    FST<Pair<Long, BytesRef>> build = builder.build();
+                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
+                    /*
+                     * it's possible that the FST is null if we have 2 segments that get merged
+                     * and all docs that have a value in this field are deleted. This will cause
+                     * a consumer to be created but it doesn't consume any values causing the FSTBuilder
+                     * to return null.
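+                     * In that case the field is simply skipped below: no FST is saved and no offset is recorded for it.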
+ */ + if (build != null) { + fieldOffsets.put(field, output.getFilePointer()); + build.save(output); + /* write some more meta-info */ + output.writeVInt(maxAnalyzedPathsForOneInput); + output.writeVInt(maxSurfaceFormsPerAnalyzedForm); + output.writeInt(maxGraphExpansions); // can be negative + int options = 0; + options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0; + options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; + options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; + output.writeVInt(options); + output.writeVInt(XAnalyzingSuggester.SEP_LABEL); + output.writeVInt(XAnalyzingSuggester.END_BYTE); + output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP); + output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER); + } + } + } + }; + } + + + @Override + public LookupFactory load(IndexInput input) throws IOException { + long sizeInBytes = 0; + int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST); + if (version >= CODEC_VERSION_CHECKSUMS) { + CodecUtil.checksumEntireFile(input); + } + final long metaPointerPosition = input.length() - (version >= CODEC_VERSION_CHECKSUMS? 8 + CodecUtil.footerLength() : 8); + final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<>(); + input.seek(metaPointerPosition); + long metaPointer = input.readLong(); + input.seek(metaPointer); + int numFields = input.readVInt(); + + Map<Long, String> meta = new TreeMap<>(); + for (int i = 0; i < numFields; i++) { + String name = input.readString(); + long offset = input.readVLong(); + meta.put(offset, name); + } + + for (Map.Entry<Long, String> entry : meta.entrySet()) { + input.seek(entry.getKey()); + FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>( + PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton())); + int maxAnalyzedPathsForOneInput = input.readVInt(); + int maxSurfaceFormsPerAnalyzedForm = input.readVInt(); + int maxGraphExpansions = input.readInt(); + int options = input.readVInt(); + boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPARATORS) != 0; + boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0; + boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0; + + // first version did not include these three fields, so fall back to old default (before the analyzingsuggester + // was updated in Lucene, so we cannot use the suggester defaults) + int sepLabel, payloadSep, endByte, holeCharacter; + switch (version) { + case CODEC_VERSION_START: + sepLabel = 0xFF; + payloadSep = '\u001f'; + endByte = 0x0; + holeCharacter = '\u001E'; + break; + default: + sepLabel = input.readVInt(); + endByte = input.readVInt(); + payloadSep = input.readVInt(); + holeCharacter = input.readVInt(); + } + + AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements, + maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput, + fst, sepLabel, payloadSep, endByte, holeCharacter); + sizeInBytes += fst.ramBytesUsed(); + lookupMap.put(entry.getValue(), holder); + } + final long ramBytesUsed = sizeInBytes; + return new LookupFactory() { + @Override + public Lookup getLookup(CompletionFieldMapper2x.CompletionFieldType fieldType, CompletionSuggestionContext suggestionContext) { + AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(fieldType.name()); + if (analyzingSuggestHolder == null) { + return null; + } + int flags = analyzingSuggestHolder.getPreserveSeparator() ? 
XAnalyzingSuggester.PRESERVE_SEP : 0; + + final XAnalyzingSuggester suggester; + final Automaton queryPrefix = fieldType.requiresContext() ? + ContextQuery.toAutomaton(analyzingSuggestHolder.getPreserveSeparator(), suggestionContext.getContextQueries()) : null; + + final FuzzyOptions fuzzyOptions = suggestionContext.getFuzzyOptions(); + if (fuzzyOptions != null) { + suggester = new XFuzzySuggester(fieldType.indexAnalyzer(), queryPrefix, fieldType.searchAnalyzer(), flags, + analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions, + fuzzyOptions.getEditDistance(), fuzzyOptions.isTranspositions(), + fuzzyOptions.getFuzzyPrefixLength(), fuzzyOptions.getFuzzyMinLength(), fuzzyOptions.isUnicodeAware(), + analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads, + analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, + analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte, + analyzingSuggestHolder.holeCharacter); + } else { + suggester = new XAnalyzingSuggester(fieldType.indexAnalyzer(), queryPrefix, fieldType.searchAnalyzer(), flags, + analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions, + analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads, + analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel, + analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte, analyzingSuggestHolder.holeCharacter); + } + return suggester; + } + + @Override + public CompletionStats stats(String... fields) { + long sizeInBytes = 0; + ObjectLongHashMap<String> completionFields = null; + if (fields != null && fields.length > 0) { + completionFields = new ObjectLongHashMap<>(fields.length); + } + + for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) { + sizeInBytes += entry.getValue().fst.ramBytesUsed(); + if (fields == null || fields.length == 0) { + continue; + } + if (Regex.simpleMatch(fields, entry.getKey())) { + long fstSize = entry.getValue().fst.ramBytesUsed(); + completionFields.addTo(entry.getKey(), fstSize); + } + } + + return new CompletionStats(sizeInBytes, completionFields); + } + + @Override + AnalyzingSuggestHolder getAnalyzingSuggestHolder(MappedFieldType fieldType) { + return lookupMap.get(fieldType.name()); + } + + @Override + public long ramBytesUsed() { + return ramBytesUsed; + } + + @Override + public Collection<Accountable> getChildResources() { + return Accountables.namedAccountables("field", lookupMap); + } + }; + } + + static class AnalyzingSuggestHolder implements Accountable { + final boolean preserveSep; + final boolean preservePositionIncrements; + final int maxSurfaceFormsPerAnalyzedForm; + final int maxGraphExpansions; + final boolean hasPayloads; + final int maxAnalyzedPathsForOneInput; + final FST<Pair<Long, BytesRef>> fst; + final int sepLabel; + final int payloadSep; + final int endByte; + final int holeCharacter; + + public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, + int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, + boolean hasPayloads, int maxAnalyzedPathsForOneInput, + FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, + int endByte, int holeCharacter) { + this.preserveSep = preserveSep; + this.preservePositionIncrements = preservePositionIncrements; + this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm; + this.maxGraphExpansions = maxGraphExpansions; 
+ this.hasPayloads = hasPayloads; + this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput; + this.fst = fst; + this.sepLabel = sepLabel; + this.payloadSep = payloadSep; + this.endByte = endByte; + this.holeCharacter = holeCharacter; + } + + public boolean getPreserveSeparator() { + return preserveSep; + } + + public boolean getPreservePositionIncrements() { + return preservePositionIncrements; + } + + public boolean hasPayloads() { + return hasPayloads; + } + + @Override + public long ramBytesUsed() { + if (fst != null) { + return fst.ramBytesUsed(); + } else { + return 0; + } + } + + @Override + public Collection<Accountable> getChildResources() { + if (fst != null) { + return Collections.singleton(Accountables.namedAccountable("fst", fst)); + } else { + return Collections.emptyList(); + } + } + } + + @Override + public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { + return prototype.toFiniteStrings(stream); + } + + +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/Completion090PostingsFormat.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/Completion090PostingsFormat.java new file mode 100644 index 0000000000..8e9e51597d --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/Completion090PostingsFormat.java @@ -0,0 +1,353 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest.completion2x; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterLeafReader.FilterTerms; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.store.IOContext.Context; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.Loggers; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper2x; +import org.elasticsearch.search.suggest.completion.CompletionStats; +import org.elasticsearch.search.suggest.completion.CompletionSuggestionContext; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * This {@link PostingsFormat} is basically a T-Sink for a default postings + * format that is used to store postings on disk fitting the Lucene APIs and + * builds a suggest FST as an auxiliary data structure next to the actual + * postings format. It uses the delegate postings format for simplicity to + * handle all the merge operations. The auxiliary suggest FST data structure is + * only loaded if a FieldsProducer is requested for reading; for merging it uses + * the low-memory delegate postings format. + */ +public class Completion090PostingsFormat extends PostingsFormat { + + public static final String CODEC_NAME = "completion090"; + public static final int SUGGEST_CODEC_VERSION = 1; + public static final int SUGGEST_VERSION_CURRENT = SUGGEST_CODEC_VERSION; + public static final String EXTENSION = "cmp"; + + private final static ESLogger logger = Loggers.getLogger(Completion090PostingsFormat.class); + private PostingsFormat delegatePostingsFormat; + private final static Map<String, CompletionLookupProvider> providers; + private CompletionLookupProvider writeProvider; + + + static { + final CompletionLookupProvider provider = new AnalyzingCompletionLookupProvider(true, true, false); + providers = Collections.singletonMap(provider.getName(), provider); + } + + public Completion090PostingsFormat(PostingsFormat delegatePostingsFormat, CompletionLookupProvider provider) { + super(CODEC_NAME); + this.delegatePostingsFormat = delegatePostingsFormat; + this.writeProvider = provider; + assert delegatePostingsFormat != null && writeProvider != null; + } + + /* + * Used only by core Lucene at read-time via Service Provider instantiation; + * do not use at write-time in application code.
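+ * For illustration (not part of this commit): at read time Lucene resolves the + * format by its codec name via SPI, e.g. + * PostingsFormat pf = PostingsFormat.forName("completion090"); + * which is what invokes this no-arg constructor.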
+ */ + public Completion090PostingsFormat() { + super(CODEC_NAME); + } + + @Override + public CompletionFieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + if (delegatePostingsFormat == null) { + throw new UnsupportedOperationException("Error - " + getClass().getName() + + " has been constructed without a choice of PostingsFormat"); + } + assert writeProvider != null; + return new CompletionFieldsConsumer(state); + } + + @Override + public CompletionFieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + return new CompletionFieldsProducer(state); + } + + private class CompletionFieldsConsumer extends FieldsConsumer { + + private FieldsConsumer delegatesFieldsConsumer; + private FieldsConsumer suggestFieldsConsumer; + + public CompletionFieldsConsumer(SegmentWriteState state) throws IOException { + this.delegatesFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state); + String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION); + IndexOutput output = null; + boolean success = false; + try { + output = state.directory.createOutput(suggestFSTFile, state.context); + CodecUtil.writeHeader(output, CODEC_NAME, SUGGEST_VERSION_CURRENT); + /* + * we write the delegate postings format name so we can load it + * without getting an instance in the ctor + */ + output.writeString(delegatePostingsFormat.getName()); + output.writeString(writeProvider.getName()); + this.suggestFieldsConsumer = writeProvider.consumer(output); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(output); + } + } + } + + @Override + public void write(Fields fields) throws IOException { + delegatesFieldsConsumer.write(fields); + suggestFieldsConsumer.write(fields); + } + + @Override + public void close() throws IOException { + IOUtils.close(delegatesFieldsConsumer, suggestFieldsConsumer); + } + } + + private static class CompletionFieldsProducer extends FieldsProducer { + // TODO make this class lazyload all the things in order to take advantage of the new merge instance API + // today we just load everything up-front + private final FieldsProducer delegateProducer; + private final LookupFactory lookupFactory; + private final int version; + + public CompletionFieldsProducer(SegmentReadState state) throws IOException { + String suggestFSTFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, EXTENSION); + IndexInput input = state.directory.openInput(suggestFSTFile, state.context); + version = CodecUtil.checkHeader(input, CODEC_NAME, SUGGEST_CODEC_VERSION, SUGGEST_VERSION_CURRENT); + FieldsProducer delegateProducer = null; + boolean success = false; + try { + PostingsFormat delegatePostingsFormat = PostingsFormat.forName(input.readString()); + String providerName = input.readString(); + CompletionLookupProvider completionLookupProvider = providers.get(providerName); + if (completionLookupProvider == null) { + throw new IllegalStateException("no provider with name [" + providerName + "] registered"); + } + // TODO: we could clone the ReadState and make it always forward IOContext.MERGE to prevent unnecessary heap usage?
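+ // the delegate provides the actual terms and postings; the auxiliary + // suggest FST per field is loaded separately below, unless this producer + // is only opened for a merge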
+ delegateProducer = delegatePostingsFormat.fieldsProducer(state); + /* + * If we are merging we don't load the FSTs at all such that we + * don't consume so much memory during merge + */ + if (state.context.context != Context.MERGE) { + // TODO: maybe we can do this in a fully lazy fashion based on some configuration + // eventually we should have some kind of circuit breaker that prevents us from going OOM here + // with some configuration + this.lookupFactory = completionLookupProvider.load(input); + } else { + this.lookupFactory = null; + } + this.delegateProducer = delegateProducer; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(delegateProducer, input); + } else { + IOUtils.close(input); + } + } + } + + @Override + public void close() throws IOException { + IOUtils.close(delegateProducer); + } + + @Override + public Iterator<String> iterator() { + return delegateProducer.iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + final Terms terms = delegateProducer.terms(field); + if (terms == null || lookupFactory == null) { + return terms; + } + return new CompletionTerms(terms, lookupFactory); + } + + @Override + public int size() { + return delegateProducer.size(); + } + + @Override + public long ramBytesUsed() { + return (lookupFactory == null ? 0 : lookupFactory.ramBytesUsed()) + delegateProducer.ramBytesUsed(); + } + + @Override + public Collection<Accountable> getChildResources() { + List<Accountable> resources = new ArrayList<>(); + if (lookupFactory != null) { + resources.add(Accountables.namedAccountable("lookup", lookupFactory)); + } + resources.add(Accountables.namedAccountable("delegate", delegateProducer)); + return Collections.unmodifiableList(resources); + } + + @Override + public void checkIntegrity() throws IOException { + delegateProducer.checkIntegrity(); + } + + @Override + public FieldsProducer getMergeInstance() throws IOException { + return delegateProducer.getMergeInstance(); + } + } + + public static final class CompletionTerms extends FilterTerms { + private final LookupFactory lookup; + + public CompletionTerms(Terms delegate, LookupFactory lookup) { + super(delegate); + this.lookup = lookup; + } + + public Lookup getLookup(CompletionFieldMapper2x.CompletionFieldType mapper, CompletionSuggestionContext suggestionContext) { + return lookup.getLookup(mapper, suggestionContext); + } + + public CompletionStats stats(String ...
fields) { + return lookup.stats(fields); + } + } + + public static abstract class CompletionLookupProvider implements PayloadProcessor, CompletionTokenStream.ToFiniteStrings { + + public static final char UNIT_SEPARATOR = '\u001f'; + + public abstract FieldsConsumer consumer(IndexOutput output) throws IOException; + + public abstract String getName(); + + public abstract LookupFactory load(IndexInput input) throws IOException; + + @Override + public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException { + if (weight < -1 || weight > Integer.MAX_VALUE) { + throw new IllegalArgumentException("weight must be >= -1 && <= Integer.MAX_VALUE"); + } + for (int i = 0; i < surfaceForm.length; i++) { + if (surfaceForm.bytes[i] == UNIT_SEPARATOR) { + throw new IllegalArgumentException( + "surface form cannot contain unit separator character U+001F; this character is reserved"); + } + } + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream); + output.writeVLong(weight + 1); + output.writeVInt(surfaceForm.length); + output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length); + output.writeVInt(payload.length); + output.writeBytes(payload.bytes, 0, payload.length); + + output.close(); + return new BytesRef(byteArrayOutputStream.toByteArray()); + } + + @Override + public void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException { + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length); + InputStreamDataInput input = new InputStreamDataInput(byteArrayInputStream); + ref.weight = input.readVLong() - 1; + int len = input.readVInt(); + ref.surfaceForm.grow(len); + ref.surfaceForm.setLength(len); + input.readBytes(ref.surfaceForm.bytes(), 0, ref.surfaceForm.length()); + len = input.readVInt(); + ref.payload.grow(len); + ref.payload.setLength(len); + input.readBytes(ref.payload.bytes(), 0, ref.payload.length()); + input.close(); + } + } + + /** + * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code>O(numIndexedFields)</code>. + * + * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out its in-heap bytes + * separately in the returned {@link CompletionStats} + */ + public CompletionStats completionStats(IndexReader indexReader, String ... fieldNamePatterns) { + CompletionStats completionStats = new CompletionStats(); + for (LeafReaderContext atomicReaderContext : indexReader.leaves()) { + LeafReader atomicReader = atomicReaderContext.reader(); + try { + Fields fields = atomicReader.fields(); + for (String fieldName : fields) { + Terms terms = fields.terms(fieldName); + if (terms instanceof CompletionTerms) { + CompletionTerms completionTerms = (CompletionTerms) terms; + completionStats.add(completionTerms.stats(fieldNamePatterns)); + } + } + } catch (IOException ioe) { + logger.error("Could not get completion stats", ioe); + } + } + + return completionStats; + } + + public static abstract class LookupFactory implements Accountable { + public abstract Lookup getLookup(CompletionFieldMapper2x.CompletionFieldType fieldType, + CompletionSuggestionContext suggestionContext); + public abstract CompletionStats stats(String ... 
fields); + abstract AnalyzingCompletionLookupProvider.AnalyzingSuggestHolder getAnalyzingSuggestHolder(MappedFieldType fieldType); + } +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/CompletionSuggestion.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/CompletionSuggestion.java new file mode 100644 index 0000000000..565be4fcd9 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/CompletionSuggestion.java @@ -0,0 +1,144 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.search.suggest.completion2x; + +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentHelper; +import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.search.suggest.Suggest; + +import java.io.IOException; +import java.util.Map; + +/** + * + */ +public class CompletionSuggestion extends Suggest.Suggestion<CompletionSuggestion.Entry> { + + public static final int TYPE = 2; + + public CompletionSuggestion() { + } + + public CompletionSuggestion(String name, int size) { + super(name, size); + } + + @Override + public int getType() { + return TYPE; + } + + @Override + protected Entry newEntry() { + return new Entry(); + } + + public static class Entry extends org.elasticsearch.search.suggest.Suggest.Suggestion.Entry<CompletionSuggestion.Entry.Option> { + + public Entry(Text text, int offset, int length) { + super(text, offset, length); + } + + protected Entry() { + super(); + } + + @Override + protected Option newOption() { + return new Option(); + } + + public static class Option extends org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option { + private BytesReference payload; + + public Option(Text text, float score, BytesReference payload) { + super(text, score); + this.payload = payload; + } + + + protected Option() { + super(); + } + + public void setPayload(BytesReference payload) { + this.payload = payload; + } + + public BytesReference getPayload() { + return payload; + } + + public String getPayloadAsString() { + return payload.toUtf8(); + } + + public long getPayloadAsLong() { + return Long.parseLong(payload.toUtf8()); + } + + public double getPayloadAsDouble() { + return Double.parseDouble(payload.toUtf8()); + } + + public Map<String, Object> getPayloadAsMap() { + return XContentHelper.convertToMap(payload, false).v2(); + } + + @Override + public void setScore(float score) { + super.setScore(score); + } + + 
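+ // payload rendering (see below): payloads detectable as structured xcontent + // are embedded raw, anything else is emitted as a UTF-8 string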
@Override + protected XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { + super.innerToXContent(builder, params); + if (payload != null && payload.length() > 0) { + XContentType contentType = XContentFactory.xContentType(payload); + if (contentType == null) { + // must be a string or number + builder.field("payload", payload.toUtf8()); + } else { + builder.rawField("payload", payload); + } + } + return builder; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + super.readFrom(in); + payload = in.readBytesReference(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeBytesReference(payload); + } + } + } + +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/CompletionTokenStream.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/CompletionTokenStream.java new file mode 100644 index 0000000000..cbf34be838 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/CompletionTokenStream.java @@ -0,0 +1,176 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest.completion2x; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.CharsRefBuilder; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.fst.Util; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Set; + +/** + * + */ +public final class CompletionTokenStream extends TokenStream { + + private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class); + private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class); + private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class); + + + private final TokenStream input; + private BytesRef payload; + private Iterator<IntsRef> finiteStrings; + private ToFiniteStrings toFiniteStrings; + private int posInc = -1; + private static final int MAX_PATHS = 256; + private CharTermAttribute charTermAttribute; + + public CompletionTokenStream(TokenStream input, BytesRef payload, ToFiniteStrings toFiniteStrings) { + // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume + // the input stream entirely in toFiniteStrings(input) + this.input = input; + this.payload = payload; + this.toFiniteStrings = toFiniteStrings; + } + + @Override + public boolean incrementToken() throws IOException { + clearAttributes(); + if (finiteStrings == null) { + Set<IntsRef> strings = toFiniteStrings.toFiniteStrings(input); + + if (strings.size() > MAX_PATHS) { + throw new IllegalArgumentException("TokenStream expanded to " + strings.size() + " finite strings. Only <= " + MAX_PATHS + + " finite strings are supported"); + } + posInc = strings.size(); + finiteStrings = strings.iterator(); + } + if (finiteStrings.hasNext()) { + posAttr.setPositionIncrement(posInc); + /* + * this posInc encodes the number of paths that this surface form + * produced. Multi fields have the same surface form and therefore sum up. + */
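+ // For illustration (not in the original commit): an analyzer with a synonym + // such as "wifi" -> {"wifi", "wi fi"} expands one input to two finite strings, + // so the first emitted token carries a position increment of 2.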
+ posInc = 0; + Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8 + if (charTermAttribute != null) { + charTermAttribute.setLength(0); + charTermAttribute.append(bytesAtt.toUTF16()); + } + if (payload != null) { + payloadAttr.setPayload(this.payload); + } + return true; + } + + return false; + } + + @Override + public void end() throws IOException { + super.end(); + if (posInc == -1) { + input.end(); + } + } + + @Override + public void close() throws IOException { + input.close(); + } + + public static interface ToFiniteStrings { + public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException; + } + + @Override + public void reset() throws IOException { + super.reset(); + if (hasAttribute(CharTermAttribute.class)) { + // we only create this if we really need it to save the UTF-8 to UTF-16 conversion + charTermAttribute = getAttribute(CharTermAttribute.class); + } + finiteStrings = null; + posInc = -1; + } + + public interface ByteTermAttribute extends TermToBytesRefAttribute { + // marker interface + + /** + * Return the builder from which the term is derived. + */ + public BytesRefBuilder builder(); + + public CharSequence toUTF16(); + } + + public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute { + private final BytesRefBuilder bytes = new BytesRefBuilder(); + private CharsRefBuilder charsRef; + + @Override + public BytesRefBuilder builder() { + return bytes; + } + + @Override + public BytesRef getBytesRef() { + return bytes.get(); + } + + @Override + public void clear() { + bytes.clear(); + } + + @Override + public void reflectWith(AttributeReflector reflector) { + + } + + @Override + public void copyTo(AttributeImpl target) { + ByteTermAttributeImpl other = (ByteTermAttributeImpl) target; + other.bytes.copyBytes(bytes); + } + + @Override + public CharSequence toUTF16() { + if (charsRef == null) { + charsRef = new CharsRefBuilder(); + } + charsRef.copyUTF8Bytes(getBytesRef()); + return charsRef.get(); + } + } +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/PayloadProcessor.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/PayloadProcessor.java new file mode 100644 index 0000000000..eb857ce61e --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/PayloadProcessor.java @@ -0,0 +1,38 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +package org.elasticsearch.search.suggest.completion2x; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +import java.io.IOException; + +interface PayloadProcessor { + + BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException; + + void parsePayload(BytesRef payload, SuggestPayload ref) throws IOException; + + static class SuggestPayload { + final BytesRefBuilder payload = new BytesRefBuilder(); + long weight = 0; + final BytesRefBuilder surfaceForm = new BytesRefBuilder(); + } +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/CategoryContextMapping.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/CategoryContextMapping.java new file mode 100644 index 0000000000..775d8b031a --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/CategoryContextMapping.java @@ -0,0 +1,374 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.suggest.completion2x.context; + +import org.apache.lucene.analysis.PrefixAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.XContentParser.Token; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.ParseContext.Document; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** + * The {@link CategoryContextMapping} is used to define a {@link ContextMapping} that + * references a field within a document. The value of the field in turn will be + * used to set up the suggestions made by the completion suggester.
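+ * <p> + * A minimal construction sketch (illustrative, not part of this commit; the + * field name and values are hypothetical): + * <pre> + *   CategoryContextMapping mapping = new CategoryContextMapping.Builder("color", "color_field") + *       .addDefaultValue("red") + *       .build(); + * </pre>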
+ */ +public class CategoryContextMapping extends ContextMapping { + + protected static final String TYPE = "category"; + + private static final String FIELD_FIELDNAME = "path"; + private static final String DEFAULT_FIELDNAME = "_type"; + + private static final Iterable<String> EMPTY_VALUES = Collections.emptyList(); + + private final String fieldName; + private final Iterable<String> defaultValues; + private final FieldConfig defaultConfig; + + /** + * Create a new {@link CategoryContextMapping} with the default field + * <code>[_type]</code> + */ + public CategoryContextMapping(String name) { + this(name, DEFAULT_FIELDNAME, EMPTY_VALUES); + } + + /** + * Create a new {@link CategoryContextMapping} with the given field + */ + public CategoryContextMapping(String name, String fieldName) { + this(name, fieldName, EMPTY_VALUES); + } + + /** + * Create a new {@link CategoryContextMapping} with the default field + * <code>[_type]</code> and the given default values + */ + public CategoryContextMapping(String name, Iterable<String> defaultValues) { + this(name, DEFAULT_FIELDNAME, defaultValues); + } + + /** + * Create a new {@link CategoryContextMapping} with the given field and + * default values + */ + public CategoryContextMapping(String name, String fieldName, Iterable<String> defaultValues) { + super(TYPE, name); + this.fieldName = fieldName; + this.defaultValues = defaultValues; + this.defaultConfig = new FieldConfig(fieldName, defaultValues, null); + } + + /** + * Name of the field used by this {@link CategoryContextMapping} + */ + public String getFieldName() { + return fieldName; + } + + public Iterable<? extends CharSequence> getDefaultValues() { + return defaultValues; + } + + @Override + public FieldConfig defaultConfig() { + return defaultConfig; + } + + /** + * Load the specification of a {@link CategoryContextMapping} + * + * @param name + * name of the field to use. If <code>null</code> the default field + * will be used
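+ * @param config configuration map for this mapping; for illustration (the + *        keys are the constants above, the values hypothetical): + * <pre> + *   Map&lt;String, Object&gt; config = new HashMap&lt;&gt;(); + *   config.put("path", "color_field"); + *   config.put("default", Arrays.asList("red", "blue")); + *   CategoryContextMapping mapping = CategoryContextMapping.load("color", config); + * </pre>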
+ * @return new {@link CategoryContextMapping} + */ + protected static CategoryContextMapping load(String name, Map<String, Object> config) throws ElasticsearchParseException { + CategoryContextMapping.Builder mapping = new CategoryContextMapping.Builder(name); + + Object fieldName = config.get(FIELD_FIELDNAME); + Object defaultValues = config.get(FIELD_MISSING); + + if (fieldName != null) { + mapping.fieldName(fieldName.toString()); + config.remove(FIELD_FIELDNAME); + } + + if (defaultValues != null) { + if (defaultValues instanceof Iterable) { + for (Object value : (Iterable) defaultValues) { + mapping.addDefaultValue(value.toString()); + } + } else { + mapping.addDefaultValue(defaultValues.toString()); + } + config.remove(FIELD_MISSING); + } + + return mapping.build(); + } + + @Override + protected XContentBuilder toInnerXContent(XContentBuilder builder, Params params) throws IOException { + if (fieldName != null) { + builder.field(FIELD_FIELDNAME, fieldName); + } + builder.startArray(FIELD_MISSING); + for (CharSequence value : defaultValues) { + builder.value(value); + } + builder.endArray(); + return builder; + } + + @Override + public ContextConfig parseContext(ParseContext parseContext, XContentParser parser) throws IOException, ElasticsearchParseException { + Token token = parser.currentToken(); + if (token == Token.VALUE_NULL) { + return new FieldConfig(fieldName, defaultValues, null); + } else if (token == Token.VALUE_STRING) { + return new FieldConfig(fieldName, null, Collections.singleton(parser.text())); + } else if (token == Token.VALUE_NUMBER) { + return new FieldConfig(fieldName, null, Collections.singleton(parser.text())); + } else if (token == Token.VALUE_BOOLEAN) { + return new FieldConfig(fieldName, null, Collections.singleton(parser.text())); + } else if (token == Token.START_ARRAY) { + ArrayList<String> values = new ArrayList<>(); + while((token = parser.nextToken()) != Token.END_ARRAY) { + values.add(parser.text()); + } + if(values.isEmpty()) { + throw new ElasticsearchParseException("FieldConfig must contain at least one category"); + } + return new FieldConfig(fieldName, null, values); + } else { + throw new ElasticsearchParseException("FieldConfig must be either [null], a string or a list of strings"); + } + } + + @Override + public FieldQuery parseQuery(String name, XContentParser parser) throws IOException, ElasticsearchParseException { + Iterable<? extends CharSequence> values; + Token token = parser.currentToken(); + if (token == Token.START_ARRAY) { + ArrayList<String> list = new ArrayList<>(); + while ((token = parser.nextToken()) != Token.END_ARRAY) { + list.add(parser.text()); + } + values = list; + } else if (token == Token.VALUE_NULL) { + values = defaultValues; + } else { + values = Collections.singleton(parser.text()); + } + + return new FieldQuery(name, values); + } + + public static FieldQuery query(String name, CharSequence... fieldvalues) { + return query(name, Arrays.asList(fieldvalues)); + } + + public static FieldQuery query(String name, Iterable<?
extends CharSequence> fieldvalues) { + return new FieldQuery(name, fieldvalues); + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof CategoryContextMapping) { + CategoryContextMapping other = (CategoryContextMapping) obj; + if (this.fieldName.equals(other.fieldName)) { + return Objects.deepEquals(this.defaultValues, other.defaultValues); + } + } + return false; + } + + @Override + public int hashCode() { + int hashCode = fieldName.hashCode(); + for (CharSequence seq : defaultValues) { + hashCode = 31 * hashCode + seq.hashCode(); + } + return hashCode; + } + + private static class FieldConfig extends ContextConfig { + + private final String fieldname; + private final Iterable<String> defaultValues; + private final Iterable<String> values; + + public FieldConfig(String fieldname, Iterable<String> defaultValues, Iterable<String> values) { + this.fieldname = fieldname; + this.defaultValues = defaultValues; + this.values = values; + } + + @Override + protected TokenStream wrapTokenStream(Document doc, TokenStream stream) { + if (values != null) { + return new PrefixAnalyzer.PrefixTokenFilter(stream, ContextMapping.SEPARATOR, values); + // if fieldname is default, BUT our default values are set, we take that one + } else if ((doc.getFields(fieldname).length == 0 + || fieldname.equals(DEFAULT_FIELDNAME)) && defaultValues.iterator().hasNext()) { + return new PrefixAnalyzer.PrefixTokenFilter(stream, ContextMapping.SEPARATOR, defaultValues); + } else { + IndexableField[] fields = doc.getFields(fieldname); + ArrayList<CharSequence> values = new ArrayList<>(fields.length); + for (int i = 0; i < fields.length; i++) { + values.add(fields[i].stringValue()); + } + + return new PrefixAnalyzer.PrefixTokenFilter(stream, ContextMapping.SEPARATOR, values); + } + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("FieldConfig(" + fieldname + " = ["); + if (this.values != null && this.values.iterator().hasNext()) { + final Iterator<String> valuesIterator = this.values.iterator(); + sb.append("("); + while (valuesIterator.hasNext()) { + sb.append(valuesIterator.next()); + if (valuesIterator.hasNext()) { + sb.append(", "); + } + } + sb.append(")"); + } + if (this.defaultValues != null && this.defaultValues.iterator().hasNext()) { + final Iterator<String> defaultValuesIterator = this.defaultValues.iterator(); + sb.append(" default("); + while (defaultValuesIterator.hasNext()) { + sb.append(defaultValuesIterator.next()); + if (defaultValuesIterator.hasNext()) { + sb.append(", "); + } + } + sb.append(")"); + } + return sb.append("])").toString(); + } + + } + + private static class FieldQuery extends ContextQuery { + + private final Iterable<? extends CharSequence> values; + + public FieldQuery(String name, Iterable<? 
extends CharSequence> values) { + super(name); + this.values = values; + } + + @Override + public Automaton toAutomaton() { + List<Automaton> automatons = new ArrayList<>(); + for (CharSequence value : values) { + automatons.add(Automata.makeString(value.toString())); + } + return Operations.union(automatons); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startArray(name); + for (CharSequence value : values) { + builder.value(value); + } + builder.endArray(); + return builder; + } + } + + public static class Builder extends ContextBuilder<CategoryContextMapping> { + + private String fieldname; + private List<String> defaultValues = new ArrayList<>(); + + public Builder(String name) { + this(name, DEFAULT_FIELDNAME); + } + + public Builder(String name, String fieldname) { + super(name); + this.fieldname = fieldname; + } + + /** + * Set the name of the field to use + */ + public Builder fieldName(String fieldname) { + this.fieldname = fieldname; + return this; + } + + /** + * Add value to the default values of the mapping + */ + public Builder addDefaultValue(String defaultValue) { + this.defaultValues.add(defaultValue); + return this; + } + + /** + * Add set of default values to the mapping + */ + public Builder addDefaultValues(String... defaultValues) { + Collections.addAll(this.defaultValues, defaultValues); + return this; + } + + /** + * Add set of default values to the mapping + */ + public Builder addDefaultValues(Iterable<String> defaultValues) { + for (String defaultValue : defaultValues) { + this.defaultValues.add(defaultValue); + } + return this; + } + + @Override + public CategoryContextMapping build() { + return new CategoryContextMapping(name, fieldname, defaultValues); + } + } +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/ContextBuilder.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/ContextBuilder.java new file mode 100644 index 0000000000..16ef0053bb --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/ContextBuilder.java @@ -0,0 +1,135 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.suggest.completion2x.context; + +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.Version; +import org.elasticsearch.index.mapper.DocumentMapperParser; + +import java.util.Map; +import java.util.Map.Entry; +import java.util.SortedMap; +import java.util.TreeMap; + +public abstract class ContextBuilder<E extends ContextMapping> { + + protected String name; + + public ContextBuilder(String name) { + this.name = name; + } + + public abstract E build(); + + /** + * Create a new {@link GeolocationContextMapping} + */ + public static GeolocationContextMapping.Builder location(String name) { + return new GeolocationContextMapping.Builder(name); + } + + /** + * Create a new {@link GeolocationContextMapping} with given precision and + * neighborhood usage + * + * @param precision geohash length + * @param neighbors use neighbor cells + */ + public static GeolocationContextMapping.Builder location(String name, int precision, boolean neighbors) { + return new GeolocationContextMapping.Builder(name, neighbors, precision); + } + + /** + * Create a new {@link CategoryContextMapping} + */ + public static CategoryContextMapping.Builder category(String name) { + return new CategoryContextMapping.Builder(name, null); + } + + /** + * Create a new {@link CategoryContextMapping} with a default category + * + * @param defaultCategory category to use if none is provided + */ + public static CategoryContextMapping.Builder category(String name, String defaultCategory) { + return new CategoryContextMapping.Builder(name, null).addDefaultValue(defaultCategory); + } + + /** + * Create a new {@link CategoryContextMapping} + * + * @param fieldname + * name of the field to use + */ + public static CategoryContextMapping.Builder reference(String name, String fieldname) { + return new CategoryContextMapping.Builder(name, fieldname); + } + + /** + * Create a new {@link CategoryContextMapping} + * + * @param fieldname name of the field to use + * @param defaultValues values to use if the document does not provide + * a field with the given name + */ + public static CategoryContextMapping.Builder reference(String name, String fieldname, Iterable<String> defaultValues) { + return new CategoryContextMapping.Builder(name, fieldname).addDefaultValues(defaultValues); + } + + public static SortedMap<String, ContextMapping> loadMappings(Object configuration, Version indexVersionCreated) + throws ElasticsearchParseException { + if (configuration instanceof Map) { + Map<String, Object> configurations = (Map<String, Object>)configuration; + SortedMap<String, ContextMapping> mappings = new TreeMap<>(); + for (Entry<String,Object> config : configurations.entrySet()) { + String name = config.getKey(); + mappings.put(name, loadMapping(name, (Map<String, Object>) config.getValue(), indexVersionCreated)); + } + return mappings; + } else if (configuration == null) { + return ContextMapping.EMPTY_MAPPING; + } else { + throw new ElasticsearchParseException("no valid context configuration"); + } + } + + protected static ContextMapping loadMapping(String name, Map<String, Object> config, Version indexVersionCreated) + throws ElasticsearchParseException { + final Object argType = config.get(ContextMapping.FIELD_TYPE); + + if (argType == null) { + throw new ElasticsearchParseException("missing [{}] in context mapping", ContextMapping.FIELD_TYPE); + } + + final String type = argType.toString(); + ContextMapping contextMapping; + if (GeolocationContextMapping.TYPE.equals(type)) {
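+ // Illustration (not in the original commit): a config map such as + // {"type": "geo", "precision": 5, "neighbors": true} is dispatched + // here to GeolocationContextMapping.load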
contextMapping = GeolocationContextMapping.load(name, config); + } else if (CategoryContextMapping.TYPE.equals(type)) { + contextMapping = CategoryContextMapping.load(name, config); + } else { + throw new ElasticsearchParseException("unknown context type [{}]", type); + } + config.remove(ContextMapping.FIELD_TYPE); + DocumentMapperParser.checkNoRemainingFields(name, config, indexVersionCreated); + + return contextMapping; + } +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/ContextMapping.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/ContextMapping.java new file mode 100644 index 0000000000..dd23bdecb0 --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/ContextMapping.java @@ -0,0 +1,319 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.suggest.completion2x.context; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.fst.FST; +import org.elasticsearch.ElasticsearchParseException; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.XContentParser.Token; +import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.ParseContext.Document; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.SortedMap; + +/** + * A {@link ContextMapping} is used to define a context that may be used + * in conjunction with a suggester. To define a suggester that depends on a + * specific context, a derived class of {@link ContextMapping} will be + * used to specify the kind of additional information required in order to make + * suggestions.
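+ * <p> + * For illustration (not part of this commit): on the query side the parsed + * {@link ContextQuery}s are combined into a single automaton that prefixes the + * suggester's FST paths, e.g. + * <pre> + *   Automaton prefix = ContextQuery.toAutomaton(preserveSep, queries); + * </pre>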
+ */ +public abstract class ContextMapping implements ToXContent { + + /** Character used to separate several contexts */ + public static final char SEPARATOR = '\u001D'; + + /** Dummy Context Mapping that should be used if no context is used*/ + public static final SortedMap<String, ContextMapping> EMPTY_MAPPING = Collections.emptySortedMap(); + + /** Dummy Context Config matching the Dummy Mapping by providing an empty context*/ + public static final SortedMap<String, ContextConfig> EMPTY_CONFIG = Collections.emptySortedMap(); + + /** Dummy Context matching the Dummy Mapping by not wrapping a {@link TokenStream} */ + public static final Context EMPTY_CONTEXT = new Context(EMPTY_CONFIG, null); + + public static final String FIELD_VALUE = "value"; + public static final String FIELD_MISSING = "default"; + public static final String FIELD_TYPE = "type"; + + protected final String type; // Type of the ContextMapping + protected final String name; + + /** + * Define a new context mapping of a specific type + * + * @param type + * type of the new context mapping + * @param name + * name of the new context mapping + */ + protected ContextMapping(String type, String name) { + super(); + this.type = type; + this.name = name; + } + + /** + * @return the type name of the context + */ + protected String type() { + return type; + } + + /** + * @return the name/id of the context + */ + public String name() { + return name; + } + + @Override + public final XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(name); + builder.field(FIELD_TYPE, type); + toInnerXContent(builder, params); + builder.endObject(); + return builder; + } + + /** + * A {@link ContextMapping} combined with the information provided by a document + * forms a {@link ContextConfig} which is used to build the underlying FST. + * + * @param parseContext context of parsing phase + * @param parser {@link XContentParser} used to read and set up the configuration + * @return A {@link ContextConfig} related to <b>this</b> mapping + * + */ + public abstract ContextConfig parseContext(ParseContext parseContext, XContentParser parser) + throws IOException, ElasticsearchParseException; + + public abstract ContextConfig defaultConfig(); + + /** + * Parse a query according to the context. Parsing starts at the parser's <b>current</b> position + * + * @param name name of the context + * @param parser {@link XContentParser} providing the data of the query + * + * @return {@link ContextQuery} according to this mapping + * + */ + public abstract ContextQuery parseQuery(String name, XContentParser parser) throws IOException, ElasticsearchParseException; + + /** + * Since every context mapping is assumed to have a name given by the field name of a context object, this + * method is used to build the value used to serialize the mapping + * + * @param builder builder to append the mapping to + * @param params parameters passed to the builder + * + * @return the builder used + * + */ + protected abstract XContentBuilder toInnerXContent(XContentBuilder builder, Params params) throws IOException; + + /** + * Test equality of two mappings + * + * @param thisMappings first mapping + * @param otherMappings second mapping + * + * @return true if both arguments are equal + */ + public static boolean mappingsAreEqual(SortedMap<String, ? extends ContextMapping> thisMappings, + SortedMap<String, ?
extends ContextMapping> otherMappings) { + return Objects.equals(thisMappings, otherMappings); + } + + @Override + public String toString() { + try { + return toXContent(JsonXContent.contentBuilder(), ToXContent.EMPTY_PARAMS).string(); + } catch (IOException e) { + return super.toString(); + } + } + + /** + * A collection of {@link ContextMapping}s, their {@link ContextConfig}uration and a + * Document form a complete {@link Context}. Since this object provides all information used + * to set up a suggestion, it can be used to wrap the entire {@link TokenStream} used to build a + * path within the {@link FST}. + */ + public static class Context { + + final SortedMap<String, ContextConfig> contexts; + final Document doc; + + public Context(SortedMap<String, ContextConfig> contexts, Document doc) { + super(); + this.contexts = contexts; + this.doc = doc; + } + + /** + * Wrap the {@link TokenStream} according to the provided information of {@link ContextConfig} + * and a related {@link Document}. + * + * @param tokenStream {@link TokenStream} to wrap + * + * @return wrapped token stream + */ + public TokenStream wrapTokenStream(TokenStream tokenStream) { + for (ContextConfig context : contexts.values()) { + tokenStream = context.wrapTokenStream(doc, tokenStream); + } + return tokenStream; + } + } + + /** + * A {@link ContextMapping} combined with the information provided by a document + * forms a {@link ContextConfig} which is used to build the underlying {@link FST}. This class holds + * a simple method wrapping a {@link TokenStream} by the provided document information. + */ + public static abstract class ContextConfig { + + /** + * Wrap a {@link TokenStream} for building suggestions to use context information + * provided by a document or a {@link ContextMapping} + * + * @param doc document related to the stream + * @param stream original stream used to build the underlying {@link FST} + * + * @return A new {@link TokenStream} providing additional context information + */ + protected abstract TokenStream wrapTokenStream(Document doc, TokenStream stream); + + } + + /** + * A {@link ContextQuery} defines the context information for a specific {@link ContextMapping} + * defined within a suggestion request. According to the parameters set in the request and the + * {@link ContextMapping} such a query is used to wrap the {@link TokenStream} of the actual + * suggestion request into a {@link TokenStream} with the context settings + */ + public static abstract class ContextQuery implements ToXContent { + + protected final String name; + + protected ContextQuery(String name) { + this.name = name; + } + + public String name() { + return name; + } + + /** + * Create an automaton for a given context query; this automaton will be used + * to find the matching paths within the FST + * + * @param preserveSep set an additional char (<code>XAnalyzingSuggester.SEP_LABEL</code>) between each context query + * @param queries list of {@link ContextQuery} defining the lookup context + * + * @return Automaton matching the given Query + */ + public static Automaton toAutomaton(boolean preserveSep, Iterable<ContextQuery> queries) { + Automaton a = Automata.makeEmptyString(); + + Automaton gap = Automata.makeChar(ContextMapping.SEPARATOR); + if (preserveSep) { + // if separators are preserved the fst contains a SEP_LABEL + // behind each gap.
To have a matching automaton, we need to + // include the SEP_LABEL in the query as well + gap = Operations.concatenate(gap, Automata.makeChar(XAnalyzingSuggester.SEP_LABEL)); + } + + for (ContextQuery query : queries) { + a = Operations.concatenate(Arrays.asList(query.toAutomaton(), gap, a)); + } + + // TODO: should we limit this? Do any of our ContextQuery impls really create exponential regexps? + // GeoQuery looks safe (union of strings). + return Operations.determinize(a, Integer.MAX_VALUE); + } + + /** + * Build a lookup automaton for this context. + * @return lookup automaton + */ + protected abstract Automaton toAutomaton(); + + /** + * Parse a set of {@link ContextQuery} according to a given mapping + * @param mappings list of mappings defined by the suggest field + * @param parser parser holding the settings of the queries. The parser's + * current token is assumed to hold an array. The number of elements + * in this array must match the number of elements in the mappings. + * @return List of context queries + * + * @throws IOException if something unexpected happened on the underlying stream + * @throws ElasticsearchParseException if the list of queries could not be parsed + */ + public static List<ContextQuery> parseQueries(Map<String, ContextMapping> mappings, XContentParser parser) + throws IOException, ElasticsearchParseException { + + Map<String, ContextQuery> querySet = new HashMap<>(); + Token token = parser.currentToken(); + if(token == Token.START_OBJECT) { + while ((token = parser.nextToken()) != Token.END_OBJECT) { + String name = parser.currentName(); + ContextMapping mapping = mappings.get(name); + if (mapping == null) { + throw new ElasticsearchParseException("no mapping defined for [{}]", name); + } + parser.nextToken(); + querySet.put(name, mapping.parseQuery(name, parser)); + } + } + + List<ContextQuery> queries = new ArrayList<>(mappings.size()); + for (ContextMapping mapping : mappings.values()) { + queries.add(querySet.get(mapping.name)); + } + return queries; + } + + @Override + public String toString() { + try { + return toXContent(JsonXContent.contentBuilder(), ToXContent.EMPTY_PARAMS).string(); + } catch (IOException e) { + return super.toString(); + } + } + } +} diff --git a/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/GeolocationContextMapping.java b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/GeolocationContextMapping.java new file mode 100644 index 0000000000..e131efce9e --- /dev/null +++ b/core/src/main/java/org/elasticsearch/search/suggest/completion2x/context/GeolocationContextMapping.java @@ -0,0 +1,750 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */
+
+package org.elasticsearch.search.suggest.completion2x.context;
+
+import com.carrotsearch.hppc.IntHashSet;
+import org.apache.lucene.analysis.PrefixAnalyzer.PrefixTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.spatial.geopoint.document.GeoPointField;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.fst.FST;
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.common.geo.GeoHashUtils;
+import org.elasticsearch.common.geo.GeoPoint;
+import org.elasticsearch.common.geo.GeoUtils;
+import org.elasticsearch.common.unit.DistanceUnit;
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentParser;
+import org.elasticsearch.common.xcontent.XContentParser.Token;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.ParseContext;
+import org.elasticsearch.index.mapper.ParseContext.Document;
+import org.elasticsearch.index.mapper.geo.GeoPointFieldMapper;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * The {@link GeolocationContextMapping} allows taking geo information into account
+ * when building suggestions. The mapping itself works with geohashes
+ * explicitly and is configured by three parameters:
+ * <ul>
+ *   <li><code>precision</code>: length of the geohash indexed as prefix of the
+ *       completion field</li>
+ *   <li><code>neighbors</code>: should the neighbor cells of the deepest geohash
+ *       level also be indexed as alternatives to the actual geohash</li>
+ *   <li><code>location</code>: (optional) location assumed if it is not provided</li>
+ * </ul>
+ * Internally this mapping wraps the suggestions into a form
+ * <code>[geohash][suggestion]</code>. If the neighbor option is set, the cells
+ * next to the cell on the deepest geohash level (<code>precision</code>) will
+ * be indexed as well. The {@link TokenStream} used to build the {@link FST} for
+ * suggestions will be wrapped into a {@link PrefixTokenFilter} managing these
+ * geohashes as prefixes.
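+ * <p>
+ * For example (an illustrative sketch; the geohash values are made up and
+ * <code>&lt;SEP&gt;</code> stands for the context separator): a suggestion
+ * "berlin" for a document located in cell <code>u33</code> with
+ * <code>precision: 3</code> is indexed as
+ * <pre>
+ *   u33&lt;SEP&gt;berlin
+ * </pre>
+ * and, if <code>neighbors</code> is enabled, additionally once per neighbor
+ * cell of <code>u33</code>.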
+ */
+public class GeolocationContextMapping extends ContextMapping {
+
+    public static final String TYPE = "geo";
+
+    public static final String FIELD_PRECISION = "precision";
+    public static final String FIELD_NEIGHBORS = "neighbors";
+    public static final String FIELD_FIELDNAME = "path";
+
+    private final Collection<String> defaultLocations;
+    private final int[] precision;
+    private final boolean neighbors;
+    private final String fieldName;
+    private final GeoConfig defaultConfig;
+
+    /**
+     * Create a new {@link GeolocationContextMapping} with a given precision
+     *
+     * @param name
+     *            name of the context mapping
+     * @param precision
+     *            lengths of the geohashes
+     * @param neighbors
+     *            should neighbors be indexed
+     * @param defaultLocations
+     *            locations to use if none are provided by the document
+     * @param fieldName
+     *            name of an (optional) field to read locations from
+     */
+    protected GeolocationContextMapping(String name, int[] precision, boolean neighbors,
+                                        Collection<String> defaultLocations, String fieldName) {
+        super(TYPE, name);
+        this.precision = precision;
+        this.neighbors = neighbors;
+        this.defaultLocations = defaultLocations;
+        this.fieldName = fieldName;
+        this.defaultConfig = new GeoConfig(this, defaultLocations);
+    }
+
+    /**
+     * Load a {@link GeolocationContextMapping} by configuration. Such a configuration
+     * can set the parameters
+     * <ul>
+     *   <li>precision [<code>String</code>, <code>Double</code>,
+     *       <code>Float</code> or <code>Integer</code>] defines the length of the
+     *       underlying geohash</li>
+     *   <li>defaultLocation [<code>String</code>] defines the location to use if
+     *       it is not provided by the document</li>
+     *   <li>neighbors [<code>Boolean</code>] defines if the last level of the
+     *       geohash should be extended by neighbor cells</li>
+     * </ul>
+     *
+     * @param name
+     *            name of the context mapping
+     * @param config
+     *            Configuration for {@link GeolocationContextMapping}
+     * @return new {@link GeolocationContextMapping} configured by the parameters of
+     *         <code>config</code>
+     */
+    protected static GeolocationContextMapping load(String name, Map<String, Object> config) {
+        if (config == null || !config.containsKey(FIELD_PRECISION)) {
+            throw new ElasticsearchParseException("field [precision] is missing");
+        }
+
+        final GeolocationContextMapping.Builder builder = new GeolocationContextMapping.Builder(name);
+
+        final Object configPrecision = config.get(FIELD_PRECISION);
+        if (configPrecision == null) {
+            // ignore precision
+        } else if (configPrecision instanceof Integer) {
+            builder.precision((Integer) configPrecision);
+            config.remove(FIELD_PRECISION);
+        } else if (configPrecision instanceof Long) {
+            builder.precision((Long) configPrecision);
+            config.remove(FIELD_PRECISION);
+        } else if (configPrecision instanceof Double) {
+            builder.precision((Double) configPrecision);
+            config.remove(FIELD_PRECISION);
+        } else if (configPrecision instanceof Float) {
+            builder.precision((Float) configPrecision);
+            config.remove(FIELD_PRECISION);
+        } else if (configPrecision instanceof Iterable) {
+            for (Object precision : (Iterable<?>) configPrecision) {
+                if (precision instanceof Integer) {
+                    builder.precision((Integer) precision);
+                } else if (precision instanceof Long) {
+                    builder.precision((Long) precision);
+                } else if (precision instanceof Double) {
+                    builder.precision((Double) precision);
+                } else if (precision instanceof Float) {
+                    builder.precision((Float) precision);
+                } else {
+                    builder.precision(precision.toString());
+                }
+            }
+            config.remove(FIELD_PRECISION);
+        } else {
+            builder.precision(configPrecision.toString());
+            config.remove(FIELD_PRECISION);
+        }
+
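+        // For example (illustrative only, not taken from the original source),
+        // a mapping config such as
+        //   { "type": "geo", "precision": [2, "5km"], "neighbors": true }
+        // registers geohash level 2 plus the level derived from the 5km distance.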
+        final Object configNeighbors = config.get(FIELD_NEIGHBORS);
+        if (configNeighbors != null) {
+            builder.neighbors((Boolean) configNeighbors);
+            config.remove(FIELD_NEIGHBORS);
+        }
+
+        final Object def = config.get(FIELD_MISSING);
+        if (def != null) {
+            if (def instanceof Iterable) {
+                for (Object location : (Iterable<?>) def) {
+                    builder.addDefaultLocation(location.toString());
+                }
+            } else if (def instanceof String) {
+                builder.addDefaultLocation(def.toString());
+            } else if (def instanceof Map) {
+                Map<String, Object> latlonMap = (Map<String, Object>) def;
+                if (!latlonMap.containsKey("lat") || !(latlonMap.get("lat") instanceof Double)) {
+                    throw new ElasticsearchParseException(
+                        "field [{}] map must have field lat and a valid latitude", FIELD_MISSING);
+                }
+                if (!latlonMap.containsKey("lon") || !(latlonMap.get("lon") instanceof Double)) {
+                    throw new ElasticsearchParseException(
+                        "field [{}] map must have field lon and a valid longitude", FIELD_MISSING);
+                }
+                builder.addDefaultLocation(
+                    Double.valueOf(latlonMap.get("lat").toString()), Double.valueOf(latlonMap.get("lon").toString()));
+            } else {
+                throw new ElasticsearchParseException("field [{}] must be of type string, list, or lat/lon map", FIELD_MISSING);
+            }
+            config.remove(FIELD_MISSING);
+        }
+
+        final Object fieldName = config.get(FIELD_FIELDNAME);
+        if (fieldName != null) {
+            builder.field(fieldName.toString());
+            config.remove(FIELD_FIELDNAME);
+        }
+        return builder.build();
+    }
+
+    @Override
+    protected XContentBuilder toInnerXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.field(FIELD_PRECISION, precision);
+        builder.field(FIELD_NEIGHBORS, neighbors);
+        if (defaultLocations != null) {
+            builder.startArray(FIELD_MISSING);
+            for (String defaultLocation : defaultLocations) {
+                builder.value(defaultLocation);
+            }
+            builder.endArray();
+        }
+        if (fieldName != null) {
+            builder.field(FIELD_FIELDNAME, fieldName);
+        }
+        return builder;
+    }
+
+    protected static Collection<String> parseSinglePointOrList(XContentParser parser) throws IOException {
+        Token token = parser.currentToken();
+        if (token == Token.START_ARRAY) {
+            token = parser.nextToken();
+            // Test if value is a single point in <code>[lon, lat]</code> format
+            if (token == Token.VALUE_NUMBER) {
+                double lon = parser.doubleValue();
+                if (parser.nextToken() == Token.VALUE_NUMBER) {
+                    double lat = parser.doubleValue();
+                    if (parser.nextToken() == Token.END_ARRAY) {
+                        return Collections.singleton(GeoHashUtils.stringEncode(lon, lat));
+                    } else {
+                        throw new ElasticsearchParseException("only two values expected");
+                    }
+                } else {
+                    throw new ElasticsearchParseException("latitude must be a numeric value");
+                }
+            } else {
+                // otherwise it's a list of locations
+                ArrayList<String> result = new ArrayList<>();
+                while (token != Token.END_ARRAY) {
+                    result.add(GeoUtils.parseGeoPoint(parser).geohash());
+                    token = parser.nextToken(); // infinite loop without this line
+                }
+                return result;
+            }
+        } else {
+            // or a single location
+            return Collections.singleton(GeoUtils.parseGeoPoint(parser).geohash());
+        }
+    }
+
+    @Override
+    public ContextConfig defaultConfig() {
+        return defaultConfig;
+    }
+
+    @Override
+    public ContextConfig parseContext(ParseContext parseContext, XContentParser parser)
+            throws IOException, ElasticsearchParseException {
+
+        if (fieldName != null) {
+            FieldMapper mapper = parseContext.docMapper().mappers().getMapper(fieldName);
+            if (!(mapper instanceof GeoPointFieldMapper)) {
+                throw new ElasticsearchParseException("referenced field must be mapped to geo_point");
+            }
+        }
+
+        Collection<String> locations;
+
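+        // Accepted context values (examples are illustrative, not from the original
+        // source): null, a single point ("u33", "52.52,13.40", or [13.40, 52.52]
+        // as [lon, lat]), or a list of points.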
+        if (parser.currentToken() == Token.VALUE_NULL) {
+            locations = null;
+        } else {
+            locations = parseSinglePointOrList(parser);
+        }
+        return new GeoConfig(this, locations);
+    }
+
+    /**
+     * Create a new geolocation query from a given GeoPoint
+     *
+     * @param point
+     *            query location
+     * @return new geolocation query
+     */
+    public static GeoQuery query(String name, GeoPoint point) {
+        return query(name, point.getGeohash());
+    }
+
+    /**
+     * Create a new geolocation query from a given geocoordinate
+     *
+     * @param lat
+     *            latitude of the location
+     * @param lon
+     *            longitude of the location
+     * @return new geolocation query
+     */
+    public static GeoQuery query(String name, double lat, double lon, int... precisions) {
+        return query(name, GeoHashUtils.stringEncode(lon, lat), precisions);
+    }
+
+    public static GeoQuery query(String name, double lat, double lon, String... precisions) {
+        int[] precisionInts = new int[precisions.length];
+        for (int i = 0; i < precisions.length; i++) {
+            precisionInts[i] = GeoUtils.geoHashLevelsForPrecision(precisions[i]);
+        }
+        return query(name, GeoHashUtils.stringEncode(lon, lat), precisionInts);
+    }
+
+    /**
+     * Create a new geolocation query from a given geohash
+     *
+     * @param geohash
+     *            geohash of the location
+     * @return new geolocation query
+     */
+    public static GeoQuery query(String name, String geohash, int... precisions) {
+        return new GeoQuery(name, geohash, precisions);
+    }
+
+    private static int parsePrecision(XContentParser parser) throws IOException, ElasticsearchParseException {
+        switch (parser.currentToken()) {
+            case VALUE_STRING:
+                return GeoUtils.geoHashLevelsForPrecision(parser.text());
+            case VALUE_NUMBER:
+                switch (parser.numberType()) {
+                    case INT:
+                    case LONG:
+                        return parser.intValue();
+                    default:
+                        return GeoUtils.geoHashLevelsForPrecision(parser.doubleValue());
+                }
+            default:
+                throw new ElasticsearchParseException("invalid precision value");
+        }
+    }
+
+    @Override
+    public GeoQuery parseQuery(String name, XContentParser parser) throws IOException, ElasticsearchParseException {
+        if (parser.currentToken() == Token.START_OBJECT) {
+            double lat = Double.NaN;
+            double lon = Double.NaN;
+            GeoPoint point = null;
+            int[] precision = null;
+
+            while (parser.nextToken() != Token.END_OBJECT) {
+                final String fieldName = parser.currentName();
+                if ("lat".equals(fieldName)) {
+                    if (point == null) {
+                        parser.nextToken();
+                        switch (parser.currentToken()) {
+                            case VALUE_NUMBER:
+                            case VALUE_STRING:
+                                lat = parser.doubleValue(true);
+                                break;
+                            default:
+                                throw new ElasticsearchParseException("latitude must be a number");
+                        }
+                    } else {
+                        throw new ElasticsearchParseException("only lat/lon or [{}] is allowed", FIELD_VALUE);
+                    }
+                } else if ("lon".equals(fieldName)) {
+                    if (point == null) {
+                        parser.nextToken();
+                        switch (parser.currentToken()) {
+                            case VALUE_NUMBER:
+                            case VALUE_STRING:
+                                lon = parser.doubleValue(true);
+                                break;
+                            default:
+                                throw new ElasticsearchParseException("longitude must be a number");
+                        }
+                    } else {
+                        throw new ElasticsearchParseException("only lat/lon or [{}] is allowed", FIELD_VALUE);
+                    }
+                } else if (FIELD_PRECISION.equals(fieldName)) {
+                    if (parser.nextToken() == Token.START_ARRAY) {
+                        IntHashSet precisions = new IntHashSet();
+                        while (parser.nextToken() != Token.END_ARRAY) {
+                            precisions.add(parsePrecision(parser));
+                        }
+                        precision = precisions.toArray();
+                    } else {
+                        precision = new int[] { parsePrecision(parser) };
+                    }
+                } else if (FIELD_VALUE.equals(fieldName)) {
+                    if (Double.isNaN(lon) && Double.isNaN(lat)) {
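+                        // FIELD_VALUE holds a complete point, e.g. a geohash
+                        // ("u33dc1"), a "lat,lon" string, or {"lat": 52.52, "lon": 13.40}
+                        // (examples are illustrative, not from the original source)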
+                        parser.nextToken();
+                        point = GeoUtils.parseGeoPoint(parser);
+                    } else {
+                        throw new ElasticsearchParseException("only lat/lon or [{}] is allowed", FIELD_VALUE);
+                    }
+                } else {
+                    throw new ElasticsearchParseException("unexpected fieldname [{}]", fieldName);
+                }
+            }
+
+            if (point == null) {
+                if (Double.isNaN(lat) || Double.isNaN(lon)) {
+                    throw new ElasticsearchParseException("location is missing");
+                } else {
+                    point = new GeoPoint(lat, lon);
+                }
+            }
+
+            if (precision == null || precision.length == 0) {
+                precision = this.precision;
+            }
+
+            return new GeoQuery(name, point.geohash(), precision);
+        } else {
+            return new GeoQuery(name, GeoUtils.parseGeoPoint(parser).getGeohash(), precision);
+        }
+    }
+
+    @Override
+    public int hashCode() {
+        final int prime = 31;
+        int result = 1;
+        result = prime * result + ((defaultLocations == null) ? 0 : defaultLocations.hashCode());
+        result = prime * result + ((fieldName == null) ? 0 : fieldName.hashCode());
+        result = prime * result + (neighbors ? 1231 : 1237);
+        result = prime * result + Arrays.hashCode(precision);
+        return result;
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        GeolocationContextMapping other = (GeolocationContextMapping) obj;
+        if (defaultLocations == null) {
+            if (other.defaultLocations != null)
+                return false;
+        } else if (!defaultLocations.equals(other.defaultLocations))
+            return false;
+        if (fieldName == null) {
+            if (other.fieldName != null)
+                return false;
+        } else if (!fieldName.equals(other.fieldName))
+            return false;
+        if (neighbors != other.neighbors)
+            return false;
+        if (!Arrays.equals(precision, other.precision))
+            return false;
+        return true;
+    }
+
+    public static class Builder extends ContextBuilder<GeolocationContextMapping> {
+
+        private IntHashSet precisions = new IntHashSet();
+        private boolean neighbors; // take neighbor cells on the lowest level into account
+        private HashSet<String> defaultLocations = new HashSet<>();
+        private String fieldName = null;
+
+        protected Builder(String name) {
+            this(name, true, null);
+        }
+
+        protected Builder(String name, boolean neighbors, int... levels) {
+            super(name);
+            neighbors(neighbors);
+            if (levels != null) {
+                for (int level : levels) {
+                    precision(level);
+                }
+            }
+        }
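+
+        /*
+         * Usage sketch (illustrative only; the Builder is protected and is
+         * normally obtained through a ContextBuilder):
+         *
+         *   GeolocationContextMapping mapping = new Builder("location")
+         *       .precision("5km")       // distance, converted to a geohash level
+         *       .neighbors(true)
+         *       .addDefaultLocation("u33")
+         *       .build();
+         */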
Default: + * meters + * @return this + */ + public Builder precision(String precision) { + return precision(DistanceUnit.parse(precision, DistanceUnit.METERS, DistanceUnit.METERS)); + } + + /** + * Set the precision use o make suggestions + * + * @param precision + * precision value + * @param unit + * {@link DistanceUnit} to use + * @return this + */ + public Builder precision(double precision, DistanceUnit unit) { + return precision(unit.toMeters(precision)); + } + + /** + * Set the precision use o make suggestions + * + * @param meters + * precision as distance in meters + * @return this + */ + public Builder precision(double meters) { + int level = GeoUtils.geoHashLevelsForPrecision(meters); + // Ceiling precision: we might return more results + if (GeoUtils.geoHashCellSize(level) < meters) { + level = Math.max(1, level - 1); + } + return precision(level); + } + + /** + * Set the precision use o make suggestions + * + * @param level + * maximum length of geohashes + * @return this + */ + public Builder precision(int level) { + this.precisions.add(level); + return this; + } + + /** + * Set neighborhood usage + * + * @param neighbors + * should neighbor cells also be valid + * @return this + */ + public Builder neighbors(boolean neighbors) { + this.neighbors = neighbors; + return this; + } + + /** + * Set a default location that should be used, if no location is + * provided by the query + * + * @param geohash + * geohash of the default location + * @return this + */ + public Builder addDefaultLocation(String geohash) { + this.defaultLocations.add(geohash); + return this; + } + + /** + * Set a default location that should be used, if no location is + * provided by the query + * + * @param geohashes + * geohash of the default location + * @return this + */ + public Builder addDefaultLocations(Collection<String> geohashes) { + this.defaultLocations.addAll(geohashes); + return this; + } + + /** + * Set a default location that should be used, if no location is + * provided by the query + * + * @param lat + * latitude of the default location + * @param lon + * longitude of the default location + * @return this + */ + public Builder addDefaultLocation(double lat, double lon) { + this.defaultLocations.add(GeoHashUtils.stringEncode(lon, lat)); + return this; + } + + /** + * Set a default location that should be used, if no location is + * provided by the query + * + * @param point + * location + * @return this + */ + public Builder defaultLocation(GeoPoint point) { + this.defaultLocations.add(point.geohash()); + return this; + } + + /** + * Set the name of the field containing a geolocation to use + * @param fieldName name of the field + * @return this + */ + public Builder field(String fieldName) { + this.fieldName = fieldName; + return this; + } + + @Override + public GeolocationContextMapping build() { + if(precisions.isEmpty()) { + precisions.add(GeoHashUtils.PRECISION); + } + int[] precisionArray = precisions.toArray(); + Arrays.sort(precisionArray); + return new GeolocationContextMapping(name, precisionArray, neighbors, defaultLocations, fieldName); + } + + } + + private static class GeoConfig extends ContextConfig { + + private final GeolocationContextMapping mapping; + private final Collection<String> locations; + + public GeoConfig(GeolocationContextMapping mapping, Collection<String> locations) { + this.locations = locations; + this.mapping = mapping; + } + + @Override + protected TokenStream wrapTokenStream(Document doc, TokenStream stream) { + Collection<String> geohashes; + + if 
+            if (locations == null || locations.size() == 0) {
+                if (mapping.fieldName != null) {
+                    IndexableField[] fields = doc.getFields(mapping.fieldName);
+                    if (fields.length == 0) {
+                        IndexableField[] lonFields = doc.getFields(mapping.fieldName + ".lon");
+                        IndexableField[] latFields = doc.getFields(mapping.fieldName + ".lat");
+                        if (lonFields.length > 0 && latFields.length > 0) {
+                            geohashes = new ArrayList<>(lonFields.length);
+                            GeoPoint spare = new GeoPoint();
+                            for (int i = 0; i < lonFields.length; i++) {
+                                IndexableField lonField = lonFields[i];
+                                IndexableField latField = latFields[i];
+                                assert lonField.fieldType().docValuesType() == latField.fieldType().docValuesType();
+                                // we write doc values fields differently: one field for all values,
+                                // so we need to only care about indexed fields
+                                if (lonField.fieldType().docValuesType() == DocValuesType.NONE) {
+                                    spare.reset(latField.numericValue().doubleValue(), lonField.numericValue().doubleValue());
+                                    geohashes.add(spare.geohash());
+                                }
+                            }
+                        } else {
+                            geohashes = mapping.defaultLocations;
+                        }
+                    } else {
+                        geohashes = new ArrayList<>(fields.length);
+                        GeoPoint spare = new GeoPoint();
+                        for (IndexableField field : fields) {
+                            if (field instanceof StringField) {
+                                spare.resetFromString(field.stringValue());
+                            } else if (field instanceof GeoPointField) {
+                                GeoPointField geoPointField = (GeoPointField) field;
+                                spare.reset(geoPointField.getLat(), geoPointField.getLon());
+                            } else {
+                                spare.resetFromString(field.stringValue());
+                            }
+                            geohashes.add(spare.geohash());
+                        }
+                    }
+                } else {
+                    geohashes = mapping.defaultLocations;
+                }
+            } else {
+                geohashes = locations;
+            }
+
+            Collection<String> prefixes = new HashSet<>();
+            for (String geohash : geohashes) {
+                for (int p : mapping.precision) {
+                    int precision = Math.min(p, geohash.length());
+                    String truncatedGeohash = geohash.substring(0, precision);
+                    if (mapping.neighbors) {
+                        GeoHashUtils.addNeighbors(truncatedGeohash, precision, prefixes);
+                    }
+                    prefixes.add(truncatedGeohash);
+                }
+            }
+
+            return new PrefixTokenFilter(stream, ContextMapping.SEPARATOR, prefixes);
+        }
+
+        @Override
+        public String toString() {
+            StringBuilder sb = new StringBuilder("GeoConfig(location = [");
+            Iterator<? extends CharSequence> location =
+                this.locations == null ? Collections.<String>emptyIterator() : this.locations.iterator();
+            if (location.hasNext()) {
+                sb.append(location.next());
+                while (location.hasNext()) {
+                    sb.append(", ").append(location.next());
+                }
+            }
+            return sb.append("])").toString();
+        }
+    }
+
+    private static class GeoQuery extends ContextQuery {
+        private final String location;
+        private final int[] precisions;
+
+        public GeoQuery(String name, String location, int... precisions) {
+            super(name);
+            this.location = location;
+            this.precisions = precisions;
+        }
+
+        @Override
+        public Automaton toAutomaton() {
+            Automaton automaton;
+            if (precisions == null || precisions.length == 0) {
+                automaton = Automata.makeString(location);
+            } else {
+                automaton = Automata.makeString(
+                    location.substring(0, Math.max(1, Math.min(location.length(), precisions[0]))));
+                for (int i = 1; i < precisions.length; i++) {
+                    final String cell = location.substring(0, Math.max(1, Math.min(location.length(), precisions[i])));
+                    automaton = Operations.union(automaton, Automata.makeString(cell));
+                }
+            }
+            return automaton;
+        }
+
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            if (precisions == null || precisions.length == 0) {
+                builder.field(name, location);
+            } else {
+                builder.startObject(name);
+                builder.field(FIELD_VALUE, location);
+                builder.field(FIELD_PRECISION, precisions);
+                builder.endObject();
+            }
+            return builder;
+        }
+    }
+}