diff options
author:    Andy Bristol <andy.bristol@elastic.co>  2017-06-13 12:46:59 -0700
committer: GitHub <noreply@github.com>  2017-06-13 12:46:59 -0700
commit:    48696ab544e32245c806d798fe56a7afcffa0e47 (patch)
tree:      feba4e7b927b815909cf5e7d4800d1c3a6d23d07 /test/framework
parent:    190242fb1b9f1d42196a7b80a8627de81a301417 (diff)
expose simple pattern tokenizers (#25159)
Expose the experimental simplepattern and
simplepatternsplit tokenizers in the common
analysis plugin. They provide tokenization based
on regular expressions, using Lucene's
deterministic regex implementation that is usually
faster than Java's and has protections against
creating too-deep stacks during matching.
Both have a not-very-useful default pattern of the
empty string because all tokenizer factories must
be able to be instantiated at index creation time.
They should always be configured by the user
in practice.
Diffstat (limited to 'test/framework')
 test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java | 28 ++++++++++++-------------
 1 file changed, 13 insertions(+), 15 deletions(-)
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index fd8a5e7cd9..a3fe52d005 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -129,25 +129,23 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
-        .put("classic", ClassicTokenizerFactory.class)
-        .put("edgengram", EdgeNGramTokenizerFactory.class)
-        .put("keyword", KeywordTokenizerFactory.class)
-        .put("letter", LetterTokenizerFactory.class)
-        .put("lowercase", LowerCaseTokenizerFactory.class)
-        .put("ngram", NGramTokenizerFactory.class)
+        .put("classic", ClassicTokenizerFactory.class)
+        .put("edgengram", EdgeNGramTokenizerFactory.class)
+        .put("keyword", KeywordTokenizerFactory.class)
+        .put("letter", LetterTokenizerFactory.class)
+        .put("lowercase", LowerCaseTokenizerFactory.class)
+        .put("ngram", NGramTokenizerFactory.class)
         .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
-        .put("pattern", PatternTokenizerFactory.class)
-        .put("standard", StandardTokenizerFactory.class)
-        .put("thai", ThaiTokenizerFactory.class)
+        .put("pattern", PatternTokenizerFactory.class)
+        .put("simplepattern", MovedToAnalysisCommon.class)
+        .put("simplepatternsplit", MovedToAnalysisCommon.class)
+        .put("standard", StandardTokenizerFactory.class)
+        .put("thai", ThaiTokenizerFactory.class)
         .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
-        .put("whitespace", WhitespaceTokenizerFactory.class)
+        .put("whitespace", WhitespaceTokenizerFactory.class)
         // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
-        .put("wikipedia", Void.class)
-
-        // TODO: expose these
-        .put("simplepattern", Void.class)
-        .put("simplepatternsplit", Void.class)
+        .put("wikipedia", Void.class)
         .immutableMap();

     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()