summaryrefslogtreecommitdiff
path: root/core/src/main/java/org/apache/lucene/search/suggest/analyzing/XAnalyzingSuggester.java
blob: a6077f8e84259bafc3c1fd4965c2f6e05bc219c6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import com.carrotsearch.hppc.ObjectIntHashMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.*;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
import org.elasticsearch.common.collect.HppcMaps;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

/**
 * Suggester that first analyzes the surface form, adds the
 * analyzed form to a weighted FST, and then does the same
 * thing at lookup time.  This means lookup is based on the
 * analyzed form while suggestions are still the surface
 * form(s).
 *
 * <p>
 * This can result in powerful suggester functionality.  For
 * example, if you use an analyzer removing stop words, 
 * then the partial text "ghost chr..." could see the
 * suggestion "The Ghost of Christmas Past".  Note that
 * position increments MUST NOT be preserved for this example
 * to work, so you should call the constructor with
 * <code>preservePositionIncrements</code> parameter set to
 * false
 *
 * <p>
 * If SynonymFilter is used to map wifi and wireless network to
 * hotspot then the partial text "wirele..." could suggest
 * "wifi router".  Token normalization like stemmers, accent
 * removal, etc., would allow suggestions to ignore such
 * variations.
 *
 * <p>
 * When two matching suggestions have the same weight, they
 * are tie-broken by the analyzed form.  If their analyzed
 * form is the same then the order is undefined.
 *
 * <p>
 * There are some limitations:
 * <ul>
 *
 *   <li> A lookup from a query like "net" in English won't
 *        be any different than "net " (ie, user added a
 *        trailing space) because analyzers don't reflect
 *        when they've seen a token separator and when they
 *        haven't.
 *
 *   <li> If you're using {@code StopFilter}, and the user will
 *        type "fast apple", but so far all they've typed is
 *        "fast a", again because the analyzer doesn't convey whether
 *        it's seen a token separator after the "a",
 *        {@code StopFilter} will remove that "a" causing
 *        far more matches than you'd expect.
 *
 *   <li> Lookups with the empty string return no results
 *        instead of all results.
 * </ul>
 * 
 * @lucene.experimental
 */
public class XAnalyzingSuggester extends Lookup {

  /**
   * FST&lt;Weight,Surface&gt;: 
   *  input is the analyzed form, with a null byte between terms
   *  weights are encoded as costs: (Integer.MAX_VALUE-weight)
   *  surface is the original, unanalyzed form.
   */
  private FST<Pair<Long,BytesRef>> fst = null;
  
  /** 
   * Analyzer that will be used for analyzing suggestions at
   * index time.
   */
  private final Analyzer indexAnalyzer;

  /** 
   * Analyzer that will be used for analyzing suggestions at
   * query time.
   */
  private final Analyzer queryAnalyzer;
  
  /** 
   * True if exact match suggestions should always be returned first.
   */
  private final boolean exactFirst;
  
  /** 
   * True if separator between tokens should be preserved.
   */
  private final boolean preserveSep;

  /** Include this flag in the options parameter to {@code
   *  #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to always
   *  return the exact match first, regardless of score.  This
   *  has no performance impact but could result in
   *  low-quality suggestions. */
  public static final int EXACT_FIRST = 1;

  /** Include this flag in the options parameter to {@code
   *  #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)} to preserve
   *  token separators when matching. */
  public static final int PRESERVE_SEP = 2;

  /** Represents the separation between tokens, if
   *  PRESERVE_SEP was specified */
  public static final int SEP_LABEL = '\u001F';

  /** Marks end of the analyzed input and start of dedup
   *  byte. */
  public static final int END_BYTE = 0x0;

  /** Maximum number of dup surface forms (different surface
   *  forms for the same analyzed form). */
  private final int maxSurfaceFormsPerAnalyzedForm;

  /** Maximum graph paths to index for a single analyzed
   *  surface form.  This only matters if your analyzer
   *  makes lots of alternate paths (e.g. contains
   *  SynonymFilter). */
  private final int maxGraphExpansions;

  /** Highest number of analyzed paths we saw for any single
   *  input surface form.  For analyzers that never create
   *  graphs this will always be 1. */
  private int maxAnalyzedPathsForOneInput;

  private boolean hasPayloads;

  private final int sepLabel;
  private final int payloadSep;
  private final int endByte;
  private final int holeCharacter;

  public static final int PAYLOAD_SEP = '\u001F';
  public static final int HOLE_CHARACTER = '\u001E';
  
  private final Automaton queryPrefix;

  /** Whether position holes should appear in the automaton. */
  private boolean preservePositionIncrements;

  /** Number of entries the lookup was built with */
  private long count = 0;

    /**
   * Calls {@code #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
   * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
   * PRESERVE_SEP, 256, -1)}
   */
  public XAnalyzingSuggester(Analyzer analyzer) {
    this(analyzer, null, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
  }

  /**
   * Calls {@code #XAnalyzingSuggester(Analyzer,Analyzer,int,int,int,boolean,FST,boolean,int,int,int,int,int)
   * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST |
   * PRESERVE_SEP, 256, -1)}
   */
  public XAnalyzingSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(indexAnalyzer, null, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, null, false, 0, SEP_LABEL, PAYLOAD_SEP, END_BYTE, HOLE_CHARACTER);
  }

  /**
   * Creates a new suggester.
   * 
   * @param indexAnalyzer Analyzer that will be used for
   *   analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for
   *   analyzing query text during lookup
   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
   *   surface forms to keep for a single analyzed form.
   *   When there are too many surface forms we discard the
   *   lowest weighted ones.
   * @param maxGraphExpansions Maximum number of graph paths
   *   to expand from the analyzed form.  Set this to -1 for
   *   no limit.
   */
  public XAnalyzingSuggester(Analyzer indexAnalyzer, Automaton queryPrefix, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                             boolean preservePositionIncrements, FST<Pair<Long, BytesRef>> fst, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                             int sepLabel, int payloadSep, int endByte, int holeCharacter) {
      // SIMON EDIT: I added fst, hasPayloads and maxAnalyzedPathsForOneInput 
    this.indexAnalyzer = indexAnalyzer;
    this.queryAnalyzer = queryAnalyzer;
    this.fst = fst;
    this.hasPayloads = hasPayloads;
    if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
      throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
    }
    this.exactFirst = (options & EXACT_FIRST) != 0;
    this.preserveSep = (options & PRESERVE_SEP) != 0;

    // FLORIAN EDIT: I added <code>queryPrefix</code> for context dependent suggestions
    this.queryPrefix = queryPrefix;
    
    // NOTE: this is just an implementation limitation; if
    // somehow this is a problem we could fix it by using
    // more than one byte to disambiguate ... but 256 seems
    // like it should be way more then enough.
    if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
      throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and < 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
    }
    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

    if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
      throw new IllegalArgumentException("maxGraphExpansions must -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
    }
    this.maxGraphExpansions = maxGraphExpansions;
    this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
    this.preservePositionIncrements = preservePositionIncrements;
    this.sepLabel = sepLabel;
    this.payloadSep = payloadSep;
    this.endByte = endByte;
    this.holeCharacter = holeCharacter;
  }

  /** Returns byte size of the underlying FST. */
  @Override
public long ramBytesUsed() {
    return fst == null ? 0 : fst.ramBytesUsed();
  }
  
  public int getMaxAnalyzedPathsForOneInput() {
      return maxAnalyzedPathsForOneInput;
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private Automaton replaceSep(Automaton a) {

      Automaton result = new Automaton();

      // Copy all states over
      int numStates = a.getNumStates();
      for(int s=0;s<numStates;s++) {
        result.createState();
        result.setAccept(s, a.isAccept(s));
      }

      // Go in reverse topo sort so we know we only have to
      // make one pass:
      Transition t = new Transition();
      int[] topoSortStates = topoSortStates(a);
      for(int i=0;i<topoSortStates.length;i++) {
        int state = topoSortStates[topoSortStates.length-1-i];
        int count = a.initTransition(state, t);
        for(int j=0;j<count;j++) {
          a.getNextTransition(t);
          if (t.min == TokenStreamToAutomaton.POS_SEP) {
            assert t.max == TokenStreamToAutomaton.POS_SEP;
            if (preserveSep) {
              // Remap to SEP_LABEL:
              result.addTransition(state, t.dest, SEP_LABEL);
            } else {
              result.addEpsilon(state, t.dest);
            }
          } else if (t.min == TokenStreamToAutomaton.HOLE) {
            assert t.max == TokenStreamToAutomaton.HOLE;

            // Just remove the hole: there will then be two
            // SEP tokens next to each other, which will only
            // match another hole at search time.  Note that
            // it will also match an empty-string token ... if
            // that's somehow a problem we can always map HOLE
            // to a dedicated byte (and escape it in the
            // input).
            result.addEpsilon(state, t.dest);
          } else {
            result.addTransition(state, t.dest, t.min, t.max);
          }
        }
      }

      result.finishState();

      return result;
  }

  protected Automaton convertAutomaton(Automaton a) {
    if (queryPrefix != null) {
      a = Operations.concatenate(Arrays.asList(queryPrefix, a));
      // This automaton should not blow up during determinize:
      a = Operations.determinize(a, Integer.MAX_VALUE);
    }
    return a;
  }
  
  private int[] topoSortStates(Automaton a) {
      int[] states = new int[a.getNumStates()];
      final Set<Integer> visited = new HashSet<>();
      final LinkedList<Integer> worklist = new LinkedList<>();
      worklist.add(0);
      visited.add(0);
      int upto = 0;
      states[upto] = 0;
      upto++;
      Transition t = new Transition();
      while (worklist.size() > 0) {
        int s = worklist.removeFirst();
        int count = a.initTransition(s, t);
        for (int i=0;i<count;i++) {
          a.getNextTransition(t);
          if (!visited.contains(t.dest)) {
            visited.add(t.dest);
            worklist.add(t.dest);
            states[upto++] = t.dest;
          }
        }
      }
      return states;
    }

  /** Just escapes the 0xff byte (which we still for SEP). */
  private static final class  EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    final BytesRefBuilder spare = new BytesRefBuilder();
    private char sepLabel;

    public EscapingTokenStreamToAutomaton(char sepLabel) {
      this.sepLabel = sepLabel;
    }

    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for(int i=0;i<in.length;i++) {
        byte b = in.bytes[in.offset+i];
        if (b == (byte) sepLabel) {
          spare.grow(upto+2);
          spare.setByteAt(upto++, (byte) sepLabel);
          spare.setByteAt(upto++, b);
        } else {
          spare.grow(upto+1);
          spare.setByteAt(upto++, b);
        }
      }
      spare.setLength(upto);
      return spare.get();
    }
  }

  public TokenStreamToAutomaton getTokenStreamToAutomaton() {
    final TokenStreamToAutomaton tsta;
    if (preserveSep) {
      tsta = new EscapingTokenStreamToAutomaton((char) sepLabel);
    } else {
      // When we're not preserving sep, we don't steal 0xff
      // byte, so we don't need to do any escaping:
      tsta = new TokenStreamToAutomaton();
    }
    tsta.setPreservePositionIncrements(preservePositionIncrements);
    return tsta;
  }
  
  private static class AnalyzingComparator implements Comparator<BytesRef> {

    private final boolean hasPayloads;

    public AnalyzingComparator(boolean hasPayloads) {
      this.hasPayloads = hasPayloads;
    }

    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
    private final BytesRef scratchA = new BytesRef();
    private final BytesRef scratchB = new BytesRef();

    @Override
    public int compare(BytesRef a, BytesRef b) {

      // First by analyzed form:
      readerA.reset(a.bytes, a.offset, a.length);
      scratchA.length = readerA.readShort();
      scratchA.bytes = a.bytes;
      scratchA.offset = readerA.getPosition();

      readerB.reset(b.bytes, b.offset, b.length);
      scratchB.bytes = b.bytes;
      scratchB.length = readerB.readShort();
      scratchB.offset = readerB.getPosition();

      int cmp = scratchA.compareTo(scratchB);
      if (cmp != 0) {
        return cmp;
      }
      readerA.skipBytes(scratchA.length);
      readerB.skipBytes(scratchB.length);
      // Next by cost:
      long aCost = readerA.readInt();
      long bCost = readerB.readInt();
      if (aCost < bCost) {
        return -1;
      } else if (aCost > bCost) {
        return 1;
      }

      // Finally by surface form:
      if (hasPayloads) {
        scratchA.length = readerA.readShort();
        scratchA.offset = readerA.getPosition();
        scratchB.length = readerB.readShort();
        scratchB.offset = readerB.getPosition();
      } else {
        scratchA.offset = readerA.getPosition();
        scratchA.length = a.length - scratchA.offset;
        scratchB.offset = readerB.getPosition();
        scratchB.length = b.length - scratchB.offset;
      }
      return scratchA.compareTo(scratchB);
    }
  }

  @Override
  public void build(InputIterator iterator) throws IOException {
    String prefix = getClass().getSimpleName();
    Path directory = OfflineSorter.getDefaultTempDir();
    Path tempInput = Files.createTempFile(directory, prefix, ".input");
    Path tempSorted = Files.createTempFile(directory, prefix, ".sorted");

    hasPayloads = iterator.hasPayloads();

    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;
    BytesRefBuilder scratch = new BytesRefBuilder();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    boolean success = false;
    count = 0;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

      for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null;) {
        LimitedFiniteStringsIterator finiteStrings =
                new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
        for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
          Util.toBytesRef(string, scratch);
          
          // length of the analyzed text (FST input)
          if (scratch.length() > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length() + ")");
          }
          short analyzedLength = (short) scratch.length();

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
            }
            payload = iterator.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }
          
          buffer = ArrayUtil.grow(buffer, requiredLength);
          
          output.reset(buffer);

          output.writeShort(analyzedLength);

          output.writeBytes(scratch.bytes(), 0, scratch.length());

          output.writeInt(encodeWeight(iterator.weight()));

          if (hasPayloads) {
            for(int i=0;i<surfaceForm.length;i++) {
              if (surfaceForm.bytes[i] == payloadSep) {
                throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
              }
            }
            output.writeShort((short) surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeBytes(payload.bytes, payload.offset, payload.length);
          } else {
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          }

          assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;

          writer.write(buffer, 0, output.getPosition());
        }
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
      }
      writer.close();

      // Sort all input/output pairs (required by FST.Builder):
      new OfflineSorter(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted);

      // Free disk space:
      Files.delete(tempInput);

      reader = new OfflineSorter.ByteSequencesReader(tempSorted);
     
      PairOutputs<Long,BytesRef> outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
      Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      // Build FST:
      BytesRefBuilder previousAnalyzed = null;
      BytesRefBuilder analyzed = new BytesRefBuilder();
      BytesRef surface = new BytesRef();
      IntsRefBuilder scratchInts = new IntsRefBuilder();
      ByteArrayDataInput input = new ByteArrayDataInput();

      // Used to remove duplicate surface forms (but we
      // still index the hightest-weight one).  We clear
      // this when we see a new analyzed form, so it cannot
      // grow unbounded (at most 256 entries):
      Set<BytesRef> seenSurfaceForms = new HashSet<>();

      int dedup = 0;
      while (reader.read(scratch)) {
        input.reset(scratch.bytes(), 0, scratch.length());
        short analyzedLength = input.readShort();
        analyzed.grow(analyzedLength+2);
        input.readBytes(analyzed.bytes(), 0, analyzedLength);
        analyzed.setLength(analyzedLength);

        long cost = input.readInt();

        surface.bytes = scratch.bytes();
        if (hasPayloads) {
          surface.length = input.readShort();
          surface.offset = input.getPosition();
        } else {
          surface.offset = input.getPosition();
          surface.length = scratch.length() - surface.offset;
        }
        
        if (previousAnalyzed == null) {
          previousAnalyzed = new BytesRefBuilder();
          previousAnalyzed.copyBytes(analyzed);
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else if (analyzed.get().equals(previousAnalyzed.get())) {
          dedup++;
          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
            // More than maxSurfaceFormsPerAnalyzedForm
            // dups: skip the rest:
            continue;
          }
          if (seenSurfaceForms.contains(surface)) {
            continue;
          }
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else {
          dedup = 0;
          previousAnalyzed.copyBytes(analyzed);
          seenSurfaceForms.clear();
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        }

        // TODO: I think we can avoid the extra 2 bytes when
        // there is no dup (dedup==0), but we'd have to fix
        // the exactFirst logic ... which would be sort of
        // hairy because we'd need to special case the two
        // (dup/not dup)...

        // NOTE: must be byte 0 so we sort before whatever
        // is next
        analyzed.append((byte) 0);
        analyzed.append((byte) dedup);

        Util.toIntsRef(analyzed.get(), scratchInts);
        //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
        if (!hasPayloads) {
          builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
        } else {
          int payloadOffset = input.getPosition() + surface.length;
          int payloadLength = scratch.length() - payloadOffset;
          BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
          System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
          br.bytes[surface.length] = (byte) payloadSep;
          System.arraycopy(scratch.bytes(), payloadOffset, br.bytes, surface.length+1, payloadLength);
          br.length = br.bytes.length;
          builder.add(scratchInts.get(), outputs.newPair(cost, br));
        }
      }
      fst = builder.finish();

      //PrintWriter pw = new PrintWriter("/tmp/out.dot");
      //Util.toDot(fst, pw, true, true);
      //pw.close();

      success = true;
    } finally {
      IOUtils.closeWhileHandlingException(reader, writer);
        
      if (success) {
        IOUtils.deleteFilesIfExist(tempInput, tempSorted);
      } else {
        IOUtils.deleteFilesIgnoringExceptions(tempInput, tempSorted);
      }
    }
  }

  @Override
  public boolean store(OutputStream output) throws IOException {
    DataOutput dataOut = new OutputStreamDataOutput(output);
    try {
      if (fst == null) {
        return false;
      }

      fst.save(dataOut);
      dataOut.writeVInt(maxAnalyzedPathsForOneInput);
      dataOut.writeByte((byte) (hasPayloads ? 1 : 0));
    } finally {
      IOUtils.close(output);
    }
    return true;
  }

    @Override
    public long getCount() {
        return count;
    }

    @Override
  public boolean load(InputStream input) throws IOException {
    DataInput dataIn = new InputStreamDataInput(input);
    try {
      this.fst = new FST<>(dataIn, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
      maxAnalyzedPathsForOneInput = dataIn.readVInt();
      hasPayloads = dataIn.readByte() == 1;
    } finally {
      IOUtils.close(input);
    }
    return true;
  }

  private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
    LookupResult result;
    if (hasPayloads) {
      int sepIndex = -1;
      for(int i=0;i<output2.length;i++) {
        if (output2.bytes[output2.offset+i] == payloadSep) {
          sepIndex = i;
          break;
        }
      }
      assert sepIndex != -1;
      final int payloadLen = output2.length - sepIndex - 1;
      spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
      BytesRef payload = new BytesRef(payloadLen);
      System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
      payload.length = payloadLen;
      result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
    } else {
      spare.copyUTF8Bytes(output2);
      result = new LookupResult(spare.toString(), decodeWeight(output1));
    }

    return result;
  }

  private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
    if (hasPayloads) {
      // output2 has at least PAYLOAD_SEP byte:
      if (key.length >= output2.length) {
        return false;
      }
      for(int i=0;i<key.length;i++) {
        if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) {
          return false;
        }
      }
      return output2.bytes[output2.offset + key.length] == payloadSep;
    } else {
      return key.bytesEquals(output2);
    }
  }

  @Override
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;

    if (onlyMorePopular) {
      throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (fst == null) {
      return Collections.emptyList();
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    for (int i = 0; i < key.length(); i++) {
      if (key.charAt(i) == holeCharacter) {
        throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
      }
      if (key.charAt(i) == sepLabel) {
        throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
      }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRefBuilder spare = new CharsRefBuilder();

      //System.out.println("  now intersect exactFirst=" + exactFirst);
    
      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

      //System.out.println("  prefixPaths: " + prefixPaths.size());

      BytesReader bytesReader = fst.getBytesReader();

      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<>();

      final List<LookupResult> results = new ArrayList<>();

      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

      if (exactFirst) {

        int count = 0;
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            count++;
          }
        }

        // Searcher just to find the single exact only
        // match, if present:
        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
        searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);

        // NOTE: we could almost get away with only using
        // the first start node.  The only catch is if
        // maxSurfaceFormsPerAnalyzedForm had kicked in and
        // pruned our exact match from one of these nodes
        // ...:
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(endByte, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
          }
        }

        Util.TopResults<Pair<Long,BytesRef>> completions = searcher.search();

        // NOTE: this is rather inefficient: we enumerate
        // every matching "exactly the same analyzed form"
        // path, and then do linear scan to see if one of
        // these exactly matches the input.  It should be
        // possible (though hairy) to do something similar
        // to getByOutput, since the surface form is encoded
        // into the FST output, so we more efficiently hone
        // in on the exact surface-form match.  Still, I
        // suspect very little time is spent in this linear
        // seach: it's bounded by how many prefix start
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        for(Result<Pair<Long,BytesRef>> completion : completions) {
          BytesRef output2 = completion.output.output2;
          if (sameSurfaceForm(utf8Key, output2)) {
            results.add(getLookupResult(completion.output.output1, output2, spare));
            break;
          }
        }

        if (results.size() == num) {
          // That was quick:
          return results;
        }
      }

      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
                                                            num - results.size(),
                                                            num * maxAnalyzedPathsForOneInput,
                                                            weightComparator) {
        private final Set<BytesRef> seen = new HashSet<>();

        @Override
        protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {

          // Dedup: when the input analyzes to a graph we
          // can get duplicate surface forms:
          if (seen.contains(output.output2)) {
            return false;
          }
          seen.add(output.output2);
          
          if (!exactFirst) {
            return true;
          } else {
            // In exactFirst mode, don't accept any paths
            // matching the surface form since that will
            // create duplicate results:
            if (sameSurfaceForm(utf8Key, output.output2)) {
              // We found exact match, which means we should
              // have already found it in the first search:
              assert results.size() == 1;
              return false;
            } else {
              return true;
            }
          }
        }
      };

      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
      
      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }

      TopResults<Pair<Long,BytesRef>> completions = searcher.search();

      for(Result<Pair<Long,BytesRef>> completion : completions) {

        LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

        // TODO: for fuzzy case would be nice to return
        // how many edits were required

        //System.out.println("    result=" + result);
        results.add(result);

        if (results.size() == num) {
          // In the exactFirst=true case the search may
          // produce one extra path
          break;
        }
      }

      return results;
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  @Override
  public boolean store(DataOutput output) throws IOException {
    output.writeVLong(count);
    if (fst == null) {
      return false;
    }

    fst.save(output);
    output.writeVInt(maxAnalyzedPathsForOneInput);
    output.writeByte((byte) (hasPayloads ? 1 : 0));
    return true;
  }

  @Override
  public boolean load(DataInput input) throws IOException {
    count = input.readVLong();
    this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
    maxAnalyzedPathsForOneInput = input.readVInt();
    hasPayloads = input.readByte() == 1;
    return true;
  }

    /** Returns all completion paths to initialize the search. */
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {
    return prefixPaths;
  }

  final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
      try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
          return toAutomaton(ts, ts2a);
      }
  }

  final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as
      // separator between tokens:
      Automaton automaton = ts2a.toAutomaton(ts);

      automaton = replaceSep(automaton);
      automaton = convertAutomaton(automaton);

      // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
      // assert SpecialOperations.isFinite(automaton);

      // Get all paths from the automaton (there can be
      // more than one path, eg if the analyzer created a
      // graph using SynFilter or WDF):

      return automaton;
  }

  // EDIT: Adrien, needed by lookup providers
  // NOTE: these XForks are unmaintainable, we need to get rid of them...
  public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
      final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
      Automaton automaton;
      try (TokenStream ts = stream) {
          automaton = toAutomaton(ts, ts2a);
      }
      LimitedFiniteStringsIterator finiteStrings =
              new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
      Set<IntsRef> set = new HashSet<>();
      for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) {
          set.add(IntsRef.deepCopyOf(string));
      }
      return Collections.unmodifiableSet(set);
  }

  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
      // TODO: is there a Reader from a CharSequence?
      // Turn tokenstream into automaton:
      Automaton automaton = null;
      
      try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
          automaton = getTokenStreamToAutomaton().toAutomaton(ts);
      }

      automaton = replaceSep(automaton);

      // TODO: we can optimize this somewhat by determinizing
      // while we convert

      // This automaton should not blow up during determinize:
      automaton = Operations.determinize(automaton, Integer.MAX_VALUE);
      return automaton;
  }
  
  

  /**
   * Returns the weight associated with an input string,
   * or null if it does not exist.
   */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }
  
  /** cost -&gt; weight */
  public static int decodeWeight(long encoded) {
    return (int)(Integer.MAX_VALUE - encoded);
  }
  
  /** weight -&gt; cost */
  public static int encodeWeight(long value) {
    if (value < 0 || value > Integer.MAX_VALUE) {
      throw new UnsupportedOperationException("cannot encode value: " + value);
    }
    return Integer.MAX_VALUE - (int)value;
  }
   
  static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () {
    @Override
    public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
      return left.output1.compareTo(right.output1);
    }
  };
  
  
    public static class XBuilder {
        private Builder<Pair<Long, BytesRef>> builder;
        private int maxSurfaceFormsPerAnalyzedForm;
        private IntsRefBuilder scratchInts = new IntsRefBuilder();
        private final PairOutputs<Long, BytesRef> outputs;
        private boolean hasPayloads;
        private BytesRefBuilder analyzed = new BytesRefBuilder();
        private final SurfaceFormAndPayload[] surfaceFormsAndPayload;
        private int count;
        private ObjectIntHashMap<BytesRef> seenSurfaceForms = HppcMaps.Object.Integer.ensureNoNullKeys(256, 0.75f);
        private int payloadSep;

        public XBuilder(int maxSurfaceFormsPerAnalyzedForm, boolean hasPayloads, int payloadSep) {
            this.payloadSep = payloadSep;
            this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
            this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
            this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
            this.hasPayloads = hasPayloads;
            surfaceFormsAndPayload = new SurfaceFormAndPayload[maxSurfaceFormsPerAnalyzedForm];

        }
        public void startTerm(BytesRef analyzed) {
            this.analyzed.grow(analyzed.length+2);
            this.analyzed.copyBytes(analyzed);
        }
        
        private final static class SurfaceFormAndPayload implements Comparable<SurfaceFormAndPayload> {
            BytesRef payload;
            long weight;
            
            public SurfaceFormAndPayload(BytesRef payload, long cost) {
                super();
                this.payload = payload;
                this.weight = cost;
            }

            @Override
            public int compareTo(SurfaceFormAndPayload o) {
                int res = compare(weight, o.weight);
                if (res == 0 ){
                    return payload.compareTo(o.payload);
                }
                return res;
            }
            public static int compare(long x, long y) {
                return (x < y) ? -1 : ((x == y) ? 0 : 1);
            }
        }

        public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException {
            int surfaceIndex = -1;
            long encodedWeight = cost == -1 ? cost : encodeWeight(cost);
            /*
             * we need to check if we have seen this surface form, if so only use the 
             * the surface form with the highest weight and drop the rest no matter if 
             * the payload differs.
             */
            if (count >= maxSurfaceFormsPerAnalyzedForm) {
                // More than maxSurfaceFormsPerAnalyzedForm
                // dups: skip the rest:
                return;
            }

            BytesRef surfaceCopy;
            final int keySlot;
            if (count > 0 && (keySlot = seenSurfaceForms.indexOf(surface)) >= 0) {
                surfaceIndex = seenSurfaceForms.indexGet(keySlot);
                SurfaceFormAndPayload surfaceFormAndPayload = surfaceFormsAndPayload[surfaceIndex];
                if (encodedWeight >= surfaceFormAndPayload.weight) {
                    return;
                }
                surfaceCopy = BytesRef.deepCopyOf(surface);
            } else {
                surfaceIndex = count++;
                surfaceCopy = BytesRef.deepCopyOf(surface);
                seenSurfaceForms.put(surfaceCopy, surfaceIndex);
            }
           
            BytesRef payloadRef;
            if (!hasPayloads) {
                payloadRef = surfaceCopy;
            } else {
                int len = surface.length + 1 + payload.length;
                final BytesRef br = new BytesRef(len);
                System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
                br.bytes[surface.length] = (byte) payloadSep;
                System.arraycopy(payload.bytes, payload.offset, br.bytes, surface.length + 1, payload.length);
                br.length = len;
                payloadRef = br;
            }
            if (surfaceFormsAndPayload[surfaceIndex] == null) {
                surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight);
            } else {
                surfaceFormsAndPayload[surfaceIndex].payload = payloadRef;
                surfaceFormsAndPayload[surfaceIndex].weight = encodedWeight;
            }
        }
        
        public void finishTerm(long defaultWeight) throws IOException {
            ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);
            int deduplicator = 0;
            analyzed.append((byte) 0);
            analyzed.setLength(analyzed.length() + 1);
            analyzed.grow(analyzed.length());
            for (int i = 0; i < count; i++) {
                analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++);
                Util.toIntsRef(analyzed.get(), scratchInts);
                SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i];
                long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight;
                builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload));
            }
            seenSurfaceForms.clear();
            count = 0;
        }

        public FST<Pair<Long, BytesRef>> build() throws IOException {
            return builder.finish();
        }

        public boolean hasPayloads() {
            return hasPayloads;
        }

        public int maxSurfaceFormsPerAnalyzedForm() {
            return maxSurfaceFormsPerAnalyzedForm;
        }

    }
}