summaryrefslogtreecommitdiff
path: root/test/framework
diff options
context:
space:
mode:
author: Andy Bristol <andy.bristol@elastic.co> 2017-06-13 12:46:59 -0700
committer: GitHub <noreply@github.com> 2017-06-13 12:46:59 -0700
commit 48696ab544e32245c806d798fe56a7afcffa0e47 (patch)
tree feba4e7b927b815909cf5e7d4800d1c3a6d23d07 /test/framework
parent 190242fb1b9f1d42196a7b80a8627de81a301417 (diff)
expose simple pattern tokenizers (#25159)
Expose the experimental simplepattern and simplepatternsplit tokenizers in the common analysis plugin. They provide tokenization based on regular expressions, using Lucene's deterministic regex implementation that is usually faster than Java's and has protections against creating too-deep stacks during matching. Both have a not-very-useful default pattern of the empty string because all tokenizer factories must be able to be instantiated at index creation time. They should always be configured by the user in practice.
Diffstat (limited to 'test/framework')
-rw-r--r--test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java28
1 file changed, 13 insertions, 15 deletions
diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
index fd8a5e7cd9..a3fe52d005 100644
--- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java
@@ -129,25 +129,23 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
// exposed in ES
- .put("classic", ClassicTokenizerFactory.class)
- .put("edgengram", EdgeNGramTokenizerFactory.class)
- .put("keyword", KeywordTokenizerFactory.class)
- .put("letter", LetterTokenizerFactory.class)
- .put("lowercase", LowerCaseTokenizerFactory.class)
- .put("ngram", NGramTokenizerFactory.class)
+ .put("classic", ClassicTokenizerFactory.class)
+ .put("edgengram", EdgeNGramTokenizerFactory.class)
+ .put("keyword", KeywordTokenizerFactory.class)
+ .put("letter", LetterTokenizerFactory.class)
+ .put("lowercase", LowerCaseTokenizerFactory.class)
+ .put("ngram", NGramTokenizerFactory.class)
.put("pathhierarchy", PathHierarchyTokenizerFactory.class)
- .put("pattern", PatternTokenizerFactory.class)
- .put("standard", StandardTokenizerFactory.class)
- .put("thai", ThaiTokenizerFactory.class)
+ .put("pattern", PatternTokenizerFactory.class)
+ .put("simplepattern", MovedToAnalysisCommon.class)
+ .put("simplepatternsplit", MovedToAnalysisCommon.class)
+ .put("standard", StandardTokenizerFactory.class)
+ .put("thai", ThaiTokenizerFactory.class)
.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
- .put("whitespace", WhitespaceTokenizerFactory.class)
+ .put("whitespace", WhitespaceTokenizerFactory.class)
// this one "seems to mess up offsets". probably shouldn't be a tokenizer...
- .put("wikipedia", Void.class)
-
- // TODO: expose these
- .put("simplepattern", Void.class)
- .put("simplepatternsplit", Void.class)
+ .put("wikipedia", Void.class)
.immutableMap();
static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()