diff options
author:    Andy Bristol <andy.bristol@elastic.co>  2017-06-13 12:46:59 -0700
committer: GitHub <noreply@github.com>  2017-06-13 12:46:59 -0700
commit:    48696ab544e32245c806d798fe56a7afcffa0e47 (patch)
tree:      feba4e7b927b815909cf5e7d4800d1c3a6d23d07 /test/framework
parent:    190242fb1b9f1d42196a7b80a8627de81a301417 (diff)
expose simple pattern tokenizers (#25159)
Expose the experimental simplepattern and
simplepatternsplit tokenizers in the common
analysis plugin. They provide tokenization based
on regular expressions, using Lucene's
deterministic regex implementation that is usually
faster than Java's and has protections against
creating too-deep stacks during matching.
Both have a not-very-useful default pattern of the
empty string because all tokenizer factories must
be able to be instantiated at index creation time.
They should always be configured by the user
in practice.
Diffstat (limited to 'test/framework')
 test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java | 28 ++++++++++++-------------
 1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index fd8a5e7cd9..a3fe52d005 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -129,25 +129,23 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
-        .put("classic", ClassicTokenizerFactory.class)
-        .put("edgengram", EdgeNGramTokenizerFactory.class)
-        .put("keyword", KeywordTokenizerFactory.class)
-        .put("letter", LetterTokenizerFactory.class)
-        .put("lowercase", LowerCaseTokenizerFactory.class)
-        .put("ngram", NGramTokenizerFactory.class)
+        .put("classic", ClassicTokenizerFactory.class)
+        .put("edgengram", EdgeNGramTokenizerFactory.class)
+        .put("keyword", KeywordTokenizerFactory.class)
+        .put("letter", LetterTokenizerFactory.class)
+        .put("lowercase", LowerCaseTokenizerFactory.class)
+        .put("ngram", NGramTokenizerFactory.class)
         .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
-        .put("pattern", PatternTokenizerFactory.class)
-        .put("standard", StandardTokenizerFactory.class)
-        .put("thai", ThaiTokenizerFactory.class)
+        .put("pattern", PatternTokenizerFactory.class)
+        .put("simplepattern", MovedToAnalysisCommon.class)
+        .put("simplepatternsplit", MovedToAnalysisCommon.class)
+        .put("standard", StandardTokenizerFactory.class)
+        .put("thai", ThaiTokenizerFactory.class)
         .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
-        .put("whitespace", WhitespaceTokenizerFactory.class)
+        .put("whitespace", WhitespaceTokenizerFactory.class)
         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
-        .put("wikipedia", Void.class)
-
-        // TODO: expose these
-        .put("simplepattern", Void.class)
-        .put("simplepatternsplit", Void.class)
+        .put("wikipedia", Void.class)
         .immutableMap();

     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()