author    Nik Everett <nik9000@gmail.com>    2017-04-19 18:51:34 -0400
committer GitHub <noreply@github.com>        2017-04-19 18:51:34 -0400
commit    caf376c8af7ce9972a6e2aa7c232435e07416171 (patch)
tree      583663a6df2e39cdfb5df7963ab94524d3fd2816
parent    151a65ed17616f3e72d43a26dfb0c3c32504b78a (diff)
Start building analysis-common module (#23614)
Start moving built-in analysis components into the new analysis-common module. The goals of this project are:

1. Remove core's dependency on lucene-analyzers-common.jar, which should shrink the dependencies for the transport client and the high-level REST client.
2. Prove that analysis plugins can do all the "built in" things by moving all "built in" behavior to a plugin.
3. Force tests not to depend on any oddball analyzer behavior. If tests need anything more than the standard analyzer they can use the mock analyzer provided by Lucene's test infrastructure.
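As a concrete illustration of point 3, the test changes in this commit swap real analysis chains for Lucene's mock test infrastructure. The following sketch distills that pattern from the MockFactory classes added to TransportAnalyzeActionTests and AnalysisRegistryTests below; the plugin class name is illustrative, not part of the change:

import java.util.Map;

import static java.util.Collections.singletonMap;

import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;

public class MockAnalysisPlugin implements AnalysisPlugin {
    static class MockFactory extends AbstractTokenFilterFactory {
        MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
            super(indexSettings, name, settings);
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            // Lucene's test-only MockTokenFilter stands in for a real filter;
            // here it strips English stopwords.
            return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
        }
    }

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        // Registered under "mock"; tests reference it like any other filter.
        return singletonMap("mock", MockFactory::new);
    }
}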
-rw-r--r--  buildSrc/src/main/resources/checkstyle_suppressions.xml | 8
-rw-r--r--  core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java | 5
-rw-r--r--  core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java | 12
-rw-r--r--  core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java | 6
-rw-r--r--  core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java | 11
-rw-r--r--  core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java | 94
-rw-r--r--  core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java | 2
-rw-r--r--  core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java | 90
-rw-r--r--  core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java | 154
-rw-r--r--  core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java | 27
-rw-r--r--  modules/analysis-common/build.gradle | 23
-rw-r--r--  modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java (renamed from core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java) | 18
-rw-r--r--  modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java | 39
-rw-r--r--  modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java (renamed from core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java) | 30
-rw-r--r--  modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java (renamed from core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java) | 24
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactoryTests.java (renamed from core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java) | 30
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java (renamed from core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java) | 113
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java | 36
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java | 86
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java | 154
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java | 72
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java (renamed from core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java) | 52
-rw-r--r--  modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactoryTests.java (renamed from core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java) | 25
-rw-r--r--  modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml | 11
-rw-r--r--  modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml | 11
-rw-r--r--  modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml | 27
-rw-r--r--  modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml | 82
-rw-r--r--  modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml | 13
-rw-r--r--  plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java | 9
-rw-r--r--  rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml | 125
-rw-r--r--  settings.gradle | 5
-rw-r--r--  test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java | 19
-rw-r--r--  test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java (renamed from core/src/test/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java) | 16
33 files changed, 956 insertions, 473 deletions
diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml
index 8c5aa12739..6e62b8ec34 100644
--- a/buildSrc/src/main/resources/checkstyle_suppressions.xml
+++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml
@@ -1096,7 +1096,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]MergeSchedulerConfig.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SearchSlowLog.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionType.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractCharFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractIndexAnalyzerProvider.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractTokenFilterFactory.java" checks="LineLength" />
@@ -1225,8 +1224,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]UpperCaseTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceAnalyzerProvider.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceTokenizerFactory.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactory.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]AbstractCompoundWordTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]DictionaryCompoundWordTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
@@ -2686,11 +2683,8 @@
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SettingsListenerIT.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionTypeTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]WaitUntilRefreshIT.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisRegistryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTests.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTestsHelper.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]BaseWordDelimiterTokenFilterFactoryTestCase.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CJKFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CharFilterTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CompoundAnalysisTests.java" checks="LineLength" />
@@ -2709,8 +2703,6 @@
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopAnalyzerTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopTokenFilterTests.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactoryTests.java" checks="LineLength" />
- <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]commongrams[/\\]CommonGramsTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]filter1[/\\]MyFilterTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]synonyms[/\\]SynonymsAnalysisTests.java" checks="LineLength" />
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java
index 9d287d90c8..1d3b8e296e 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java
@@ -71,4 +71,9 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
return result;
}
+
+ @Override
+ public boolean breaksFastVectorHighlighter() {
+ return true;
+ }
}
\ No newline at end of file
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
index 8c976646b8..c90138d7a2 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
+++ b/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java
@@ -20,10 +20,20 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
public interface TokenFilterFactory {
-
String name();
TokenStream create(TokenStream tokenStream);
+
+ /**
+ * Does this analyzer mess up the {@link OffsetAttribute}s in such a way as to break the
+ * {@link FastVectorHighlighter}? If this is {@code true} then the
+ * {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets.
+ */
+ default boolean breaksFastVectorHighlighter() {
+ return false;
+ }
}
diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
index 61950942e6..c494c4cae9 100644
--- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
+++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java
@@ -25,7 +25,6 @@ import org.elasticsearch.common.NamedRegistry;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@@ -140,8 +139,6 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
@@ -205,7 +202,6 @@ public final class AnalysisModule {
NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
tokenFilters.register("stop", StopTokenFilterFactory::new);
tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
- tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new);
tokenFilters.register("length", LengthTokenFilterFactory::new);
tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
@@ -225,8 +221,6 @@ public final class AnalysisModule {
tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
- tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
- tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("elision", ElisionTokenFilterFactory::new);
tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);
diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java
index ac0dab3a63..37971e6b48 100644
--- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java
+++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java
@@ -26,15 +26,9 @@ import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.util.CollectionUtil;
-import org.apache.lucene.util.Version;
import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
-import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
-import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.mapper.FieldMapper;
import java.util.Comparator;
@@ -56,7 +50,7 @@ public final class FragmentBuilderHelper {
public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) {
assert fragInfo != null : "FragInfo must not be null";
assert mapper.fieldType().name().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name();
- if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.fieldType().indexAnalyzer()))) {
+ if (!fragInfo.getSubInfos().isEmpty() && containsBrokenAnalysis(mapper.fieldType().indexAnalyzer())) {
/* This is a special case where broken analysis like WDF is used for term-vector creation at index-time
* which can potentially mess up the offsets. To prevent a SAIIOBException we need to resort
* the fragments based on their offsets rather than using solely the positions as it is done in
@@ -91,8 +85,7 @@ public final class FragmentBuilderHelper {
final CustomAnalyzer a = (CustomAnalyzer) analyzer;
TokenFilterFactory[] tokenFilters = a.tokenFilters();
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
- if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory
- || tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) {
+ if (tokenFilterFactory.breaksFastVectorHighlighter()) {
return true;
}
}
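The workaround mentioned in the comment above amounts to re-sorting the fragment's sub-infos by start offset instead of trusting position order. A minimal sketch of that idea, assuming Lucene's vectorhighlight API (the helper class is hypothetical, not part of this change):

import java.util.Comparator;
import java.util.List;

import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.util.CollectionUtil;

class OffsetResortSketch {
    /** Re-sorts sub-infos by their first term offset so filters with broken
     *  offsets (e.g. word_delimiter, edge ngrams) cannot yield out-of-order fragments. */
    static void resortByOffset(WeightedFragInfo fragInfo) {
        List<SubInfo> subInfos = fragInfo.getSubInfos();
        CollectionUtil.introSort(subInfos,
                Comparator.comparingInt(si -> si.getTermsOffsets().get(0).getStartOffset()));
    }
}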
diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
index bcd7bba8d3..57a83b2c68 100644
--- a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
+++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java
@@ -18,6 +18,8 @@
*/
package org.elasticsearch.action.admin.indices;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
@@ -27,18 +29,28 @@ import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.mapper.AllFieldMapper;
import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException;
import java.util.List;
+import java.util.Map;
-import static java.util.Collections.emptyList;
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
+/**
+ * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the
+ * {@code analysis-common} module.
+ */
public class TransportAnalyzeActionTests extends ESTestCase {
private IndexAnalyzers indexAnalyzers;
@@ -53,23 +65,28 @@ public class TransportAnalyzeActionTests extends ESTestCase {
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
- .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
- .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
- .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
- .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
- .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
- .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
- .put("index.analysis.tokenizer.trigram.type", "ngram")
- .put("index.analysis.tokenizer.trigram.min_gram", 3)
- .put("index.analysis.tokenizer.trigram.max_gram", 3)
- .put("index.analysis.filter.synonym.type", "synonym")
- .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay")
- .put("index.analysis.filter.synonym.tokenizer", "trigram")
- .put("index.analysis.filter.synonym.min_gram", 3)
- .put("index.analysis.filter.synonym.max_gram", 3).build();
+ .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
+ .put("index.analysis.analyzer.custom_analyzer.filter", "mock").build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
environment = new Environment(settings);
- registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry();
+ AnalysisPlugin plugin = new AnalysisPlugin() {
+ class MockFactory extends AbstractTokenFilterFactory {
+ MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+ super(indexSettings, name, settings);
+ }
+
+ @Override
+ public TokenStream create(TokenStream tokenStream) {
+ return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
+ }
+ }
+
+ @Override
+ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+ return singletonMap("mock", MockFactory::new);
+ }
+ };
+ registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
indexAnalyzers = registry.build(idxSettings);
}
@@ -143,51 +160,44 @@ public class TransportAnalyzeActionTests extends ESTestCase {
}
public void testWithIndexAnalyzers() throws IOException {
-
AnalyzeRequest request = new AnalyzeRequest();
- request.analyzer("standard");
request.text("the quick brown fox");
request.analyzer("custom_analyzer");
- request.text("the qu1ck brown fox");
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
- assertEquals(4, tokens.size());
+ assertEquals(3, tokens.size());
+ assertEquals("quick", tokens.get(0).getTerm());
+ assertEquals("brown", tokens.get(1).getTerm());
+ assertEquals("fox", tokens.get(2).getTerm());
- request.analyzer("whitespace");
- request.text("the qu1ck brown fox-dog");
+ request.analyzer("standard");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
assertEquals(4, tokens.size());
+ assertEquals("the", tokens.get(0).getTerm());
+ assertEquals("quick", tokens.get(1).getTerm());
+ assertEquals("brown", tokens.get(2).getTerm());
+ assertEquals("fox", tokens.get(3).getTerm());
- request.analyzer("custom_analyzer");
- request.text("the qu1ck brown fox-dog");
- analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
- tokens = analyze.getTokens();
- assertEquals(5, tokens.size());
-
+ // Switch the analyzer out for just a tokenizer
request.analyzer(null);
- request.tokenizer("whitespace");
- request.addTokenFilter("lowercase");
- request.addTokenFilter("wordDelimiter");
- request.text("the qu1ck brown fox-dog");
+ request.tokenizer("standard");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
- assertEquals(5, tokens.size());
+ assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
- assertEquals("qu1ck", tokens.get(1).getTerm());
+ assertEquals("quick", tokens.get(1).getTerm());
assertEquals("brown", tokens.get(2).getTerm());
assertEquals("fox", tokens.get(3).getTerm());
- assertEquals("dog", tokens.get(4).getTerm());
- request.analyzer(null);
- request.tokenizer("trigram");
- request.addTokenFilter("synonym");
- request.text("kimchy");
+ // Now try applying our token filter
+ request.addTokenFilter("mock");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
- assertEquals(2, tokens.size());
- assertEquals("sha", tokens.get(0).getTerm());
- assertEquals("hay", tokens.get(1).getTerm());
+ assertEquals(3, tokens.size());
+ assertEquals("quick", tokens.get(0).getTerm());
+ assertEquals("brown", tokens.get(1).getTerm());
+ assertEquals("fox", tokens.get(2).getTerm());
}
public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException {
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java
index 6893fda75b..0a62e8c491 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java
@@ -22,5 +22,5 @@ package org.elasticsearch.index.analysis;
import org.elasticsearch.AnalysisFactoryTestCase;
public class AnalysisFactoryTests extends AnalysisFactoryTestCase {
- // tests are inherited
+ // tests are inherited and nothing needs to be defined here
}
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
index 12071f0eac..0edd2fbe2c 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
+++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java
@@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -31,17 +32,20 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
+import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
-import static java.util.Collections.emptyList;
import static java.util.Collections.emptyMap;
+import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
@@ -112,51 +116,73 @@ public class AnalysisRegistryTests extends ESTestCase {
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
}
+ /**
+ * Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide.
+ */
public void testConfigureCamelCaseTokenFilter() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
- .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
- .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
- .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
- .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
- .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
- .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build();
+ .put("index.analysis.filter.testFilter.type", "mock")
+ .put("index.analysis.filter.test_filter.type", "mock")
+ .put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard")
+ .putArray("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter")
+ .put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard")
+ .putArray("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter").build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
- IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry()
+ /* The snake_case version of the name should not filter out any stopwords while the
+ * camelCase version will filter out English stopwords. */
+ AnalysisPlugin plugin = new AnalysisPlugin() {
+ class MockFactory extends AbstractTokenFilterFactory {
+ MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+ super(indexSettings, name, settings);
+ }
+
+ @Override
+ public TokenStream create(TokenStream tokenStream) {
+ if (name().equals("test_filter")) {
+ return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
+ }
+ return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
+ }
+ }
+
+ @Override
+ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+ return singletonMap("mock", MockFactory::new);
+ }
+ };
+ IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), singletonList(plugin)).getAnalysisRegistry()
.build(idxSettings);
- try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
+
+ // This shouldn't contain English stopwords
+ try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) {
assertNotNull(custom_analyser);
- TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
+ TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
- List<String> token = new ArrayList<>();
- while(tokenStream.incrementToken()) {
- token.add(charTermAttribute.toString());
- }
- assertEquals(token.toString(), 2, token.size());
- assertEquals("j2se", token.get(0));
- assertEquals("j2ee", token.get(1));
+ assertTrue(tokenStream.incrementToken());
+ assertEquals("has", charTermAttribute.toString());
+ assertTrue(tokenStream.incrementToken());
+ assertEquals("foo", charTermAttribute.toString());
+ assertFalse(tokenStream.incrementToken());
}
- try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
+ // This *should* contain English stopwords
+ try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) {
assertNotNull(custom_analyser);
- TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
+ TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
- List<String> token = new ArrayList<>();
- while(tokenStream.incrementToken()) {
- token.add(charTermAttribute.toString());
- }
- assertEquals(token.toString(), 6, token.size());
- assertEquals("j", token.get(0));
- assertEquals("2", token.get(1));
- assertEquals("se", token.get(2));
- assertEquals("j", token.get(3));
- assertEquals("2", token.get(4));
- assertEquals("ee", token.get(5));
+ assertTrue(tokenStream.incrementToken());
+ assertEquals("has", charTermAttribute.toString());
+ assertTrue(tokenStream.incrementToken());
+ assertEquals("a", charTermAttribute.toString());
+ assertTrue(tokenStream.incrementToken());
+ assertEquals("foo", charTermAttribute.toString());
+ assertFalse(tokenStream.incrementToken());
}
}
diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
index c0c52928d2..819b2c7d64 100644
--- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
+++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java
@@ -19,6 +19,7 @@
package org.elasticsearch.search.fetch.subphase.highlight;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchRequestBuilder;
@@ -100,6 +101,7 @@ import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.startsWith;
public class HighlighterSearchIT extends ESIntegTestCase {
+ // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
private static final String[] ALL_TYPES = new String[] {"plain", "postings", "fvh", "unified"};
private static final String[] UNIFIED_AND_NULL = new String[] {null, "unified"};
@@ -113,12 +115,11 @@ public class HighlighterSearchIT extends ESIntegTestCase {
mappings.startObject();
mappings.startObject("type")
.startObject("properties")
- .startObject("text")
- .field("type", "keyword")
- .field("store", true)
- .endObject()
- .endObject()
- .endObject();
+ .startObject("text")
+ .field("type", "keyword")
+ .field("store", true)
+ .endObject()
+ .endObject().endObject();
mappings.endObject();
assertAcked(prepareCreate("test")
.addMapping("type", mappings));
@@ -139,14 +140,13 @@ public class HighlighterSearchIT extends ESIntegTestCase {
mappings.startObject();
mappings.startObject("type")
.startObject("properties")
- .startObject("text")
- .field("type", "text")
- .field("analyzer", "keyword")
- .field("index_options", "offsets")
- .field("term_vector", "with_positions_offsets")
- .endObject()
- .endObject()
- .endObject();
+ .startObject("text")
+ .field("type", "text")
+ .field("analyzer", "keyword")
+ .field("index_options", "offsets")
+ .field("term_vector", "with_positions_offsets")
+ .endObject()
+ .endObject().endObject();
mappings.endObject();
assertAcked(prepareCreate("test")
.addMapping("type", mappings));
@@ -166,23 +166,22 @@ public class HighlighterSearchIT extends ESIntegTestCase {
mappings.startObject();
mappings.startObject("type")
.startObject("_source")
- .field("enabled", false)
+ .field("enabled", false)
.endObject()
.startObject("properties")
- .startObject("unstored_field")
- .field("index_options", "offsets")
- .field("term_vector", "with_positions_offsets")
- .field("type", "text")
- .field("store", false)
- .endObject()
- .startObject("text")
- .field("index_options", "offsets")
- .field("term_vector", "with_positions_offsets")
- .field("type", "text")
- .field("store", true)
- .endObject()
- .endObject()
- .endObject();
+ .startObject("unstored_field")
+ .field("index_options", "offsets")
+ .field("term_vector", "with_positions_offsets")
+ .field("type", "text")
+ .field("store", false)
+ .endObject()
+ .startObject("text")
+ .field("index_options", "offsets")
+ .field("term_vector", "with_positions_offsets")
+ .field("type", "text")
+ .field("store", true)
+ .endObject()
+ .endObject().endObject();
mappings.endObject();
assertAcked(prepareCreate("test")
.addMapping("type", mappings));
@@ -218,103 +217,6 @@ public class HighlighterSearchIT extends ESIntegTestCase {
assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
}
- public void testNgramHighlightingWithBrokenPositions() throws IOException {
- assertAcked(prepareCreate("test")
- .addMapping("test", jsonBuilder()
- .startObject()
- .startObject("test")
- .startObject("properties")
- .startObject("name")
- .startObject("fields")
- .startObject("autocomplete")
- .field("type", "text")
- .field("analyzer", "autocomplete")
- .field("search_analyzer", "search_autocomplete")
- .field("term_vector", "with_positions_offsets")
- .endObject()
- .endObject()
- .field("type", "text")
- .endObject()
- .endObject()
- .endObject()
- .endObject())
- .setSettings(Settings.builder()
- .put(indexSettings())
- .put("analysis.tokenizer.autocomplete.max_gram", 20)
- .put("analysis.tokenizer.autocomplete.min_gram", 1)
- .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
- .put("analysis.tokenizer.autocomplete.type", "nGram")
- .put("analysis.filter.wordDelimiter.type", "word_delimiter")
- .putArray("analysis.filter.wordDelimiter.type_table",
- "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
- "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM",
- "+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM",
- "^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
- "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM")
-
- .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
- .put("analysis.filter.wordDelimiter.generate_word_parts", true)
- .put("analysis.filter.wordDelimiter.generate_number_parts", false)
- .put("analysis.filter.wordDelimiter.catenate_words", true)
- .put("analysis.filter.wordDelimiter.catenate_numbers", true)
- .put("analysis.filter.wordDelimiter.catenate_all", false)
-
- .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
- .putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
- .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
- .putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")));
- client().prepareIndex("test", "test", "1")
- .setSource("name", "ARCOTEL Hotels Deutschland").get();
- refresh();
- SearchResponse search = client().prepareSearch("test").setTypes("test")
- .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
- .highlighter(new HighlightBuilder().field("name.autocomplete")).execute().actionGet();
- assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
- }
-
- public void testMultiPhraseCutoff() throws IOException {
- /*
- * MultiPhraseQuery can literally kill an entire node if there are too many terms in the
- * query. We cut off and extract terms if there are more than 16 terms in the query
- */
- assertAcked(prepareCreate("test")
- .addMapping("test",
- "body", "type=text,analyzer=custom_analyzer,search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
- .setSettings(
- Settings.builder().put(indexSettings())
- .put("analysis.filter.wordDelimiter.type", "word_delimiter")
- .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
- .put("analysis.filter.wordDelimiter.generate_word_parts", true)
- .put("analysis.filter.wordDelimiter.generate_number_parts", true)
- .put("analysis.filter.wordDelimiter.catenate_words", true)
- .put("analysis.filter.wordDelimiter.catenate_numbers", true)
- .put("analysis.filter.wordDelimiter.catenate_all", false)
- .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
- .putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter"))
- );
-
- ensureGreen();
- client().prepareIndex("test", "test", "1")
- .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
- + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
- + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
- + "http://twitter.com this is a test for highlighting feature")
- .get();
- refresh();
- SearchResponse search = client().prepareSearch().setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
- .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
- assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
- search = client()
- .prepareSearch()
- .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
- + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
- + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
- + "http://twitter.com this is a test for highlighting feature"))
- .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
- assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: <em>http://www.facebook.com</em> "
- + "<em>http://elasticsearch.org</em> <em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
- }
-
public void testNgramHighlighting() throws IOException {
assertAcked(prepareCreate("test")
.addMapping("test",
diff --git a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java
index 05dc973f9e..a30049c70d 100644
--- a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java
+++ b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java
@@ -1605,33 +1605,6 @@ public class SearchQueryIT extends ESIntegTestCase {
assertHitCount(searchResponse, 2);
}
- // see #3898
- public void testCustomWordDelimiterQueryString() {
- assertAcked(client().admin().indices().prepareCreate("test")
- .setSettings("analysis.analyzer.my_analyzer.type", "custom",
- "analysis.analyzer.my_analyzer.tokenizer", "whitespace",
- "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
- "analysis.filter.custom_word_delimiter.type", "word_delimiter",
- "analysis.filter.custom_word_delimiter.generate_word_parts", "true",
- "analysis.filter.custom_word_delimiter.generate_number_parts", "false",
- "analysis.filter.custom_word_delimiter.catenate_numbers", "true",
- "analysis.filter.custom_word_delimiter.catenate_words", "false",
- "analysis.filter.custom_word_delimiter.split_on_case_change", "false",
- "analysis.filter.custom_word_delimiter.split_on_numerics", "false",
- "analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
- .addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer"));
-
- client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get();
- refresh();
-
- SearchResponse response = client()
- .prepareSearch("test")
- .setQuery(
- queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
- .field("field1").field("field2")).get();
- assertHitCount(response, 1L);
- }
-
// see #3797
public void testMultiMatchLenientIssue3797() {
createIndex("test");
diff --git a/modules/analysis-common/build.gradle b/modules/analysis-common/build.gradle
new file mode 100644
index 0000000000..391b74934c
--- /dev/null
+++ b/modules/analysis-common/build.gradle
@@ -0,0 +1,23 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+esplugin {
+ description 'Adds "built in" analyzers to Elasticsearch.'
+ classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
+}
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java
index 5e53a86129..f8e0c7383a 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@@ -25,20 +25,26 @@ import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
/**
* Factory for ASCIIFoldingFilter.
*/
-public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
- public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
- public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
+public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
+ implements MultiTermAwareComponent {
+ public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
+ public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final boolean preserveOriginal;
- public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+ public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
+ String name, Settings settings) {
super(indexSettings, name, settings);
preserveOriginal = settings.getAsBooleanLenientForPreEs6Indices(
- indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
+ indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(),
+ DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
}
@Override
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
new file mode 100644
index 0000000000..bfd1bbdcc9
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
+ @Override
+ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+ Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
+ filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
+ filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
+ filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
+ return filters;
+ }
+}
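Moving the factories out of core does not change how they are configured: once the module is installed, its filters are still referenced by name in index settings. A hedged usage sketch (the filter and analyzer names are hypothetical; the setting keys mirror those used elsewhere in this commit):

import org.elasticsearch.common.settings.Settings;

class AsciiFoldingUsageSketch {
    static Settings indexAnalysisSettings() {
        // Define a custom filter of type "asciifolding" (now provided by
        // analysis-common) and wire it into a custom analyzer.
        return Settings.builder()
                .put("index.analysis.filter.my_folding.type", "asciifolding")
                .put("index.analysis.filter.my_folding.preserve_original", true)
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
                .putArray("index.analysis.analyzer.my_analyzer.filter", "lowercase", "my_folding")
                .build();
    }
}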
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
index 7cdc215f1b..1613339853 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@@ -26,20 +26,22 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
import java.util.List;
import java.util.Set;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
-import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
+import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.parseTypes;
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -47,7 +49,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
private final int flags;
private final CharArraySet protoWords;
- public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+ public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
+ String name, Settings settings) {
super(indexSettings, name, settings);
// Sample Format for the type table:
@@ -82,7 +85,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If not null, this is the set of tokens to protect from being delimited
- Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
+ Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
+ settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;
}
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java
index 09882072ee..8c38beb8f8 100644
--- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java
@@ -17,7 +17,7 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
import java.util.Collection;
import java.util.List;
@@ -52,7 +54,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
private final int flags;
private final CharArraySet protoWords;
- public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+ public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
+ String name, Settings settings) {
super(indexSettings, name, settings);
// Sample Format for the type table:
@@ -87,7 +90,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If not null, this is the set of tokens to protect from being delimited
- Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
+ Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
+ settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;
}
@@ -101,7 +105,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
}
public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
- if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), key, defaultValue, deprecationLogger)) {
+ if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(),
+ key, defaultValue, deprecationLogger)) {
return flag;
}
return 0;
@@ -122,14 +127,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1)
- throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+ throw new RuntimeException("Invalid Mapping Rule : ["
+ + rule + "]. Only a single character is allowed.");
if (rhs == null)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs);
}
// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
- byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
+ byte types[] = new byte[Math.max(
+ typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
for (int i = 0; i < types.length; i++)
types[i] = WordDelimiterIterator.getType(i);
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
@@ -196,4 +203,9 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
}
return new String(out, 0, writePos);
}
+
+ @Override
+ public boolean breaksFastVectorHighlighter() {
+ return true;
+ }
}
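
For context on the override added above: breaksFastVectorHighlighter() implements a hook that this commit adds to core's TokenFilterFactory (see the file list in the commit message) so that highlighting code such as FragmentBuilderHelper can ask a filter whether it breaks the FastVectorHighlighter, rather than testing for concrete filter classes that are moving out of core. A minimal sketch of the assumed shape of that hook; only the method name and its true/false semantics are confirmed by this diff, the rest of the interface is abbreviated:

    import org.apache.lucene.analysis.TokenStream;

    public interface TokenFilterFactory {
        String name();

        TokenStream create(TokenStream tokenStream);

        /**
         * Does this filter produce token streams that confuse the
         * FastVectorHighlighter? Word-delimiter style filters do, so
         * highlighting code can fall back to a different analyzer for them.
         */
        default boolean breaksFastVectorHighlighter() {
            return false;
        }
    }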
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactoryTests.java
index 973225df18..22ac081011 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactoryTests.java
@@ -17,12 +17,15 @@
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -31,10 +34,12 @@ import java.io.StringReader;
public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
String source = "Ansprüche";
String[] expected = new String[]{"Anspruche"};
@@ -44,11 +49,13 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
}
public void testPreserveOriginal() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
- .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
+ .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
String source = "Ansprüche";
String[] expected = new String[]{"Anspruche", "Ansprüche"};
@@ -57,7 +64,8 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
// but the multi-term aware component still emits a single token
- tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
+ tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter)
+ .getMultiTermComponent();
tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
expected = new String[]{"Anspruche"};
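
The multi-term branch at the end of testPreserveOriginal leans on the MultiTermAwareComponent contract: for multi-term queries (prefix, wildcard, and so on) a filter can hand back a variant of itself that maps each input to exactly one output token. A minimal sketch of the assumed contract; only the cast used in the test is confirmed by this diff. For asciifolding, the multi-term variant effectively drops preserve_original, which is why the test expects a single folded token afterwards.

    // Assumed shape of the contract the test casts through; the real
    // interface lives in org.elasticsearch.index.analysis.
    public interface MultiTermAwareComponent {
        /** Returns an analysis component to apply to multi-term queries. */
        Object getMultiTermComponent();
    }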
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java
index 713e942475..ce6d0403c0 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java
@@ -16,13 +16,15 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
@@ -30,7 +32,8 @@ import java.io.IOException;
import java.io.StringReader;
/**
- * Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory}
+ * Base class to test {@link WordDelimiterTokenFilterFactory} and
+ * {@link WordDelimiterGraphTokenFilterFactory}.
*/
public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
final String type;
@@ -40,10 +43,12 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testDefault() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
@@ -54,44 +59,51 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testCatenateWords() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
- .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+ .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
- String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
+ String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j",
+ "2", "se", "ONeil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateNumbers() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
- .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+ .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
- String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
- "se", "O", "Neil"};
+ String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000",
+ "j", "2", "se", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateAll() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
- .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
- .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+ .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+ .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
@@ -101,11 +113,13 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testSplitOnCaseChange() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
String[] expected = new String[]{"PowerShot"};
@@ -115,30 +129,35 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testPreserveOriginal() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
- String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
- "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
+ String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42",
+ "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
+ "O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testStemEnglishPossessive() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
- String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
- "se", "O", "Neil", "s"};
+ String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
+ "4000", "j", "2", "se", "O", "Neil", "s" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java
new file mode 100644
index 0000000000..b5d6bf23b3
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.analysis.common;
+
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
+import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
+import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
+
+public class CommonAnalysisClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
+ public CommonAnalysisClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
+ super(testCandidate);
+ }
+
+ @ParametersFactory
+ public static Iterable<Object[]> parameters() throws Exception {
+ return ESClientYamlSuiteTestCase.createParameters();
+ }
+}
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
new file mode 100644
index 0000000000..886dad37b5
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.AnalysisFactoryTestCase;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static java.util.Collections.emptyList;
+import static java.util.stream.Collectors.toList;
+
+public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
+ @Override
+ protected Map<String, Class<?>> getTokenizers() {
+ Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
+ return tokenizers;
+ }
+
+ @Override
+ protected Map<String, Class<?>> getTokenFilters() {
+ Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
+ filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
+ filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class);
+ filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class);
+ return filters;
+ }
+
+ @Override
+ protected Map<String, Class<?>> getCharFilters() {
+ Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
+ return filters;
+ }
+
+ /**
+ * Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
+ * hasn't been marked in this class with its proper factory.
+ */
+ public void testAllTokenizersMarked() {
+ markedTestCase("char filter", getTokenizers());
+ }
+
+ /**
+ * Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but
+ * hasn't been marked in this class with its proper factory.
+ */
+ public void testAllCharFiltersMarked() {
+ markedTestCase("char filter", getCharFilters());
+ }
+
+ /**
+ * Fails if a token filter is marked in the superclass with {@link MovedToAnalysisCommon} but
+ * hasn't been marked in this class with its proper factory.
+ */
+ public void testAllTokenFiltersMarked() {
+ markedTestCase("token filter", getTokenFilters());
+ }
+
+ private void markedTestCase(String name, Map<String, Class<?>> map) {
+ List<String> unmarked = map.entrySet().stream()
+ .filter(e -> e.getValue() == MovedToAnalysisCommon.class)
+ .map(Map.Entry::getKey)
+ .sorted()
+ .collect(toList());
+ assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common "
+ + "but not mapped here", emptyList(), unmarked);
+ }
+}
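
The mappings registered in getTokenFilters() above have to line up with what CommonAnalysisPlugin itself registers; the plugin is in this commit's file list but not shown in this hunk. A minimal sketch of what it presumably looks like, using the standard AnalysisPlugin extension point (the exact registration names and constructor references are assumptions based on the factories in this diff):

    import org.elasticsearch.index.analysis.TokenFilterFactory;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    import java.util.HashMap;
    import java.util.Map;

    public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
            // Constructor references satisfy AnalysisProvider's
            // (IndexSettings, Environment, String, Settings) signature.
            filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
            filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
            filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
            return filters;
        }
    }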
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java
new file mode 100644
index 0000000000..c022d5c85a
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.query.Operator;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
+import org.elasticsearch.test.ESIntegTestCase;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
+import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.startsWith;
+
+public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
+ @Override
+ protected Collection<Class<? extends Plugin>> nodePlugins() {
+ return Arrays.asList(CommonAnalysisPlugin.class);
+ }
+
+ public void testNgramHighlightingWithBrokenPositions() throws IOException {
+ assertAcked(prepareCreate("test")
+ .addMapping("test", jsonBuilder()
+ .startObject()
+ .startObject("test")
+ .startObject("properties")
+ .startObject("name")
+ .field("type", "text")
+ .startObject("fields")
+ .startObject("autocomplete")
+ .field("type", "text")
+ .field("analyzer", "autocomplete")
+ .field("search_analyzer", "search_autocomplete")
+ .field("term_vector", "with_positions_offsets")
+ .endObject()
+ .endObject()
+ .endObject()
+ .endObject()
+ .endObject()
+ .endObject())
+ .setSettings(Settings.builder()
+ .put(indexSettings())
+ .put("analysis.tokenizer.autocomplete.max_gram", 20)
+ .put("analysis.tokenizer.autocomplete.min_gram", 1)
+ .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
+ .put("analysis.tokenizer.autocomplete.type", "nGram")
+ .put("analysis.filter.wordDelimiter.type", "word_delimiter")
+ .putArray("analysis.filter.wordDelimiter.type_table",
+ "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
+ "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM",
+ "# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM",
+ ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM",
+ "/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM",
+ "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
+ "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM",
+ "{ => ALPHANUM")
+ .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
+ .put("analysis.filter.wordDelimiter.generate_word_parts", true)
+ .put("analysis.filter.wordDelimiter.generate_number_parts", false)
+ .put("analysis.filter.wordDelimiter.catenate_words", true)
+ .put("analysis.filter.wordDelimiter.catenate_numbers", true)
+ .put("analysis.filter.wordDelimiter.catenate_all", false)
+
+ .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
+ .putArray("analysis.analyzer.autocomplete.filter",
+ "lowercase", "wordDelimiter")
+ .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
+ .putArray("analysis.analyzer.search_autocomplete.filter",
+ "lowercase", "wordDelimiter")));
+ client().prepareIndex("test", "test", "1")
+ .setSource("name", "ARCOTEL Hotels Deutschland").get();
+ refresh();
+ SearchResponse search = client().prepareSearch("test").setTypes("test")
+ .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
+ .highlighter(new HighlightBuilder().field("name.autocomplete")).get();
+ assertHighlight(search, 0, "name.autocomplete", 0,
+ equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
+ }
+
+ public void testMultiPhraseCutoff() throws IOException {
+ /*
+ * MultiPhraseQuery can literally kill an entire node if there are too many terms in the
+ * query. We cut off and extract terms if there are more than 16 terms in the query
+ */
+ assertAcked(prepareCreate("test")
+ .addMapping("test", "body", "type=text,analyzer=custom_analyzer,"
+ + "search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
+ .setSettings(
+ Settings.builder().put(indexSettings())
+ .put("analysis.filter.wordDelimiter.type", "word_delimiter")
+ .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
+ .put("analysis.filter.wordDelimiter.generate_word_parts", true)
+ .put("analysis.filter.wordDelimiter.generate_number_parts", true)
+ .put("analysis.filter.wordDelimiter.catenate_words", true)
+ .put("analysis.filter.wordDelimiter.catenate_numbers", true)
+ .put("analysis.filter.wordDelimiter.catenate_all", false)
+ .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
+ .putArray("analysis.analyzer.custom_analyzer.filter",
+ "lowercase", "wordDelimiter"))
+ );
+
+ ensureGreen();
+ client().prepareIndex("test", "test", "1")
+ .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org "
+ + "http://xing.com http://cnn.com http://quora.com http://twitter.com this is "
+ + "a test for highlighting feature Test: http://www.facebook.com "
+ + "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
+ + "http://twitter.com this is a test for highlighting feature")
+ .get();
+ refresh();
+ SearchResponse search = client().prepareSearch()
+ .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
+ .highlighter(new HighlightBuilder().field("body")).get();
+ assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
+ search = client()
+ .prepareSearch()
+ .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "
+ + "http://elasticsearch.org http://xing.com http://cnn.com "
+ + "http://quora.com http://twitter.com this is a test for highlighting "
+ + "feature Test: http://www.facebook.com http://elasticsearch.org "
+ + "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
+ + "is a test for highlighting feature"))
+ .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
+ assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
+ + "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
+ + "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
+ }
+}
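
A rough reading of the 16-term cutoff described in the comment at the top of testMultiPhraseCutoff; the extraction itself happens inside the highlighter, not in this test, and the analysis output below is an illustration rather than something the test asserts:

    // input token:   "http://www.facebook.com"  (whitespace keeps it whole)
    // lowercase:     http://www.facebook.com
    // wordDelimiter (generate_word_parts + catenate_words):
    //                [http, www, facebook, com, httpwwwfacebookcom]
    // Six URLs plus filler words, repeated twice, push the second phrase
    // well past 16 terms, so the highlighter stops running it as one giant
    // MultiPhraseQuery and highlights extracted terms instead, which is why
    // the final assertion only sees the leading part of the phrase in <em>.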
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java
new file mode 100644
index 0000000000..7dd53a0449
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.index.query.Operator;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.ESIntegTestCase;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
+import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
+
+public class QueryStringWithAnalyzersTests extends ESIntegTestCase {
+ @Override
+ protected Collection<Class<? extends Plugin>> nodePlugins() {
+ return Arrays.asList(CommonAnalysisPlugin.class);
+ }
+
+ /**
+ * Validates that we properly split terms using the word delimiter filter in query_string.
+ */
+ public void testCustomWordDelimiterQueryString() {
+ assertAcked(client().admin().indices().prepareCreate("test")
+ .setSettings("analysis.analyzer.my_analyzer.type", "custom",
+ "analysis.analyzer.my_analyzer.tokenizer", "whitespace",
+ "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
+ "analysis.filter.custom_word_delimiter.type", "word_delimiter",
+ "analysis.filter.custom_word_delimiter.generate_word_parts", "true",
+ "analysis.filter.custom_word_delimiter.generate_number_parts", "false",
+ "analysis.filter.custom_word_delimiter.catenate_numbers", "true",
+ "analysis.filter.custom_word_delimiter.catenate_words", "false",
+ "analysis.filter.custom_word_delimiter.split_on_case_change", "false",
+ "analysis.filter.custom_word_delimiter.split_on_numerics", "false",
+ "analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
+ .addMapping("type1",
+ "field1", "type=text,analyzer=my_analyzer",
+ "field2", "type=text,analyzer=my_analyzer"));
+
+ client().prepareIndex("test", "type1", "1").setSource(
+ "field1", "foo bar baz",
+ "field2", "not needed").get();
+ refresh();
+
+ SearchResponse response = client()
+ .prepareSearch("test")
+ .setQuery(
+ queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
+ .field("field1").field("field2")).get();
+ assertHitCount(response, 1L);
+ }
+}
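
To make the hit-count assertion concrete, here is the assumed analysis of the query text (an illustration, not part of the diff):

    // query_string input:    foo.baz
    // whitespace tokenizer:  [foo.baz]   (no whitespace, kept whole)
    // custom_word_delimiter: [foo, baz]  (split on the intra-word '.')
    // Operator.AND:          a field must contain both terms;
    //                        field1 = "foo bar baz" matches, so the hit
    //                        count is 1.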
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
index 2ae4267104..bd7ff2f0c0 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java
@@ -16,52 +16,62 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.elasticsearch.index.analysis;
-
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.io.StringReader;
-public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
+public class WordDelimiterGraphTokenFilterFactoryTests
+ extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterGraphTokenFilterFactoryTests() {
super("word_delimiter_graph");
}
public void testMultiTerms() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
- .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+ .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
- String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
- "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
- "ONeil", "O'Neil's", "O", "Neil" };
+ String[] expected = new String[] { "PowerShot", "PowerShot", "Power", "Shot", "50042",
+ "500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi",
+ "fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
- int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
- int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
+ int[] expectedIncr = new int[] { 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
+ 1, 1, 1, 0, 0, 1 };
+ int[] expectedPosLen = new int[] { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3,
+ 1, 1, 1, 2, 2, 1, 1 };
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
expectedIncr, expectedPosLen, null);
}
- /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
+ /**
+ * Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
+ */
public void testPartsAndCatenate() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
- .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+ .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1};
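
For readers new to graph token streams, the increments asserted at the end of this hunk encode a small two-position graph for "PowerShot" with catenate_words and generate_word_parts both enabled (an illustration based on the arrays in this file):

    // token         posIncr  posLen  positions covered
    // "PowerShot"      1       2     spans both parts (catenate_words)
    // "Power"          0       1     same start position as the catenation
    // "Shot"           1       1     the second position
    //
    // This layout is what expectedIncr = {1, 0, 1} asserts; the graph
    // variant also records position lengths, so the catenated token spans
    // two positions instead of overlapping just one.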
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactoryTests.java
index 1e919e00bb..78c4f1485a 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactoryTests.java
@@ -16,31 +16,38 @@
* specific language governing permissions and limitations
* under the License.
*/
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.io.StringReader;
-public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
+public class WordDelimiterTokenFilterFactoryTests
+ extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterTokenFilterFactoryTests() {
super("word_delimiter");
}
- /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
+ /**
+ * Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
+ */
public void testPartsAndCatenate() throws IOException {
- ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
- .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
- .put("index.analysis.filter.my_word_delimiter.type", type)
- .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
- .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
- .build());
+ ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+ Settings.builder()
+ .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+ .put("index.analysis.filter.my_word_delimiter.type", type)
+ .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+ .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+ .build(),
+ new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
String[] expected = new String[]{"Power", "PowerShot", "Shot" };
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml
new file mode 100644
index 0000000000..d27a0861b2
--- /dev/null
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml
@@ -0,0 +1,11 @@
+"Module loaded":
+ - do:
+ cluster.state: {}
+
+ # Get master node id
+ - set: { master_node: master }
+
+ - do:
+ nodes.info: {}
+
+ - match: { nodes.$master.modules.0.name: analysis-common }
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml
new file mode 100644
index 0000000000..9fb34e7a82
--- /dev/null
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml
@@ -0,0 +1,11 @@
+## Smoke tests for analyzers included in the analysis-common module
+
+"whitespace":
+ - do:
+ indices.analyze:
+ body:
+ text: Foo Bar!
+ analyzer: whitespace
+ - length: { tokens: 2 }
+ - match: { tokens.0.token: Foo }
+ - match: { tokens.1.token: Bar! }
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml
new file mode 100644
index 0000000000..174a15f772
--- /dev/null
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml
@@ -0,0 +1,27 @@
+## Smoke tests for tokenizers included in the analysis-common module
+
+"keyword":
+ - do:
+ indices.analyze:
+ body:
+ text: Foo Bar!
+ tokenizer: keyword
+ - length: { tokens: 1 }
+ - match: { tokens.0.token: Foo Bar! }
+
+---
+"nGram":
+ - do:
+ indices.analyze:
+ body:
+ text: good
+ explain: true
+ tokenizer:
+ type: nGram
+ min_gram: 2
+ max_gram: 2
+ - length: { detail.tokenizer.tokens: 3 }
+ - match: { detail.tokenizer.name: _anonymous_tokenizer }
+ - match: { detail.tokenizer.tokens.0.token: go }
+ - match: { detail.tokenizer.tokens.1.token: oo }
+ - match: { detail.tokenizer.tokens.2.token: od }
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml
new file mode 100644
index 0000000000..ac5bcb82e5
--- /dev/null
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml
@@ -0,0 +1,82 @@
+## Smoke tests for token filters included in the analysis-common module
+
+"asciifolding":
+ - do:
+ indices.analyze:
+ body:
+ text: Musée d'Orsay
+ tokenizer: keyword
+ filter: [asciifolding]
+ - length: { tokens: 1 }
+ - match: { tokens.0.token: Musee d'Orsay }
+
+---
+"lowercase":
+ - do:
+ indices.analyze:
+ body:
+ text: Foo Bar!
+ tokenizer: keyword
+ filter: [lowercase]
+ - length: { tokens: 1 }
+ - match: { tokens.0.token: foo bar! }
+
+---
+"word_delimiter":
+ - do:
+ indices.analyze:
+ body:
+ text: the qu1ck brown fox
+ tokenizer: standard
+ filter: [word_delimiter]
+ - length: { tokens: 6 }
+ - match: { tokens.0.token: the }
+ - match: { tokens.1.token: qu }
+ - match: { tokens.2.token: "1" }
+ - match: { tokens.3.token: ck }
+ - match: { tokens.4.token: brown }
+ - match: { tokens.5.token: fox }
+
+ - do:
+ indices.analyze:
+ body:
+ text: the qu1ck brown fox
+ tokenizer: standard
+ filter:
+ - type: word_delimiter
+ split_on_numerics: false
+ - length: { tokens: 4 }
+ - match: { tokens.0.token: the }
+ - match: { tokens.1.token: qu1ck }
+ - match: { tokens.2.token: brown }
+ - match: { tokens.3.token: fox }
+
+---
+"word_delimiter_graph":
+ - do:
+ indices.analyze:
+ body:
+ text: the qu1ck brown fox
+ tokenizer: standard
+ filter: [word_delimiter_graph]
+ - length: { tokens: 6 }
+ - match: { tokens.0.token: the }
+ - match: { tokens.1.token: qu }
+ - match: { tokens.2.token: "1" }
+ - match: { tokens.3.token: ck }
+ - match: { tokens.4.token: brown }
+ - match: { tokens.5.token: fox }
+
+ - do:
+ indices.analyze:
+ body:
+ text: the qu1ck brown fox
+ tokenizer: standard
+ filter:
+ - type: word_delimiter_graph
+ split_on_numerics: false
+ - length: { tokens: 4 }
+ - match: { tokens.0.token: the }
+ - match: { tokens.1.token: qu1ck }
+ - match: { tokens.2.token: brown }
+ - match: { tokens.3.token: fox }
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml
new file mode 100644
index 0000000000..06775a2a72
--- /dev/null
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml
@@ -0,0 +1,13 @@
+## Smoke tests for char filters included in the analysis-common module
+
+"mapping":
+ - do:
+ indices.analyze:
+ body:
+ text: jeff quit phish
+ tokenizer: keyword
+ char_filter:
+ - type: mapping
+ mappings: ["ph => f", "qu => q"]
+ - length: { tokens: 1 }
+ - match: { tokens.0.token: "jeff qit fish" }
diff --git a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java
index e68cb260b0..8301529627 100644
--- a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java
+++ b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java
@@ -19,14 +19,9 @@
package org.elasticsearch.index.analysis;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.AnalysisFactoryTestCase;
import org.elasticsearch.Version;
@@ -37,6 +32,10 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase {
@Override
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml
index 268cd78128..93ce5c8c80 100644
--- a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml
@@ -1,29 +1,11 @@
-# Will be performed before each test as a part of the test setup
-#
-setup:
- - do:
- ping: {}
-
----
"Basic test":
- do:
indices.analyze:
body:
text: Foo Bar
- length: { tokens: 2 }
- - match: { tokens.0.token: foo }
- - match: { tokens.1.token: bar }
-
----
-"Tokenizer and filter":
- - do:
- indices.analyze:
- body:
- filter: [lowercase]
- text: Foo Bar
- tokenizer: keyword
- - length: { tokens: 1 }
- - match: { tokens.0.token: foo bar }
+ - match: { tokens.0.token: foo }
+ - match: { tokens.1.token: bar }
---
"Index and field":
@@ -36,7 +18,7 @@ setup:
properties:
text:
type: text
- analyzer: whitespace
+ analyzer: standard
- do:
indices.analyze:
@@ -45,84 +27,51 @@ setup:
field: text
text: Foo Bar!
- length: { tokens: 2 }
- - match: { tokens.0.token: Foo }
- - match: { tokens.1.token: Bar! }
----
-"JSON in Body":
- - do:
- indices.analyze:
- body: { "text": "Foo Bar", "filter": ["lowercase"], "tokenizer": keyword }
- - length: {tokens: 1 }
- - match: { tokens.0.token: foo bar }
+ - match: { tokens.0.token: foo }
+ - match: { tokens.1.token: bar }
+
---
"Array text":
- do:
indices.analyze:
- body: { "text": ["Foo Bar", "Baz"], "filter": ["lowercase"], "tokenizer": keyword }
- - length: {tokens: 2 }
- - match: { tokens.0.token: foo bar }
- - match: { tokens.1.token: baz }
+ body:
+ text: ["Foo Bar", "Baz"]
+ tokenizer: standard
+ - length: { tokens: 3 }
+ - match: { tokens.0.token: Foo }
+ - match: { tokens.1.token: Bar }
+ - match: { tokens.2.token: Baz }
+
---
"Detail response with Analyzer":
- do:
indices.analyze:
- body: {"text": "This is troubled", "analyzer": standard, "explain": "true"}
+ body:
+ text: This is troubled
+ analyzer: standard
+ explain: true
- length: { detail.analyzer.tokens: 3 }
- - match: { detail.analyzer.name: standard }
- - match: { detail.analyzer.tokens.0.token: this }
- - match: { detail.analyzer.tokens.1.token: is }
- - match: { detail.analyzer.tokens.2.token: troubled }
----
-"Detail output spcified attribute":
- - do:
- indices.analyze:
- body: {"text": "<text>This is troubled</text>", "char_filter": ["html_strip"], "filter": ["snowball"], "tokenizer": standard, "explain": true, "attributes": ["keyword"]}
- - length: { detail.charfilters: 1 }
- - length: { detail.tokenizer.tokens: 3 }
- - length: { detail.tokenfilters.0.tokens: 3 }
- - match: { detail.tokenizer.name: standard }
- - match: { detail.tokenizer.tokens.0.token: This }
- - match: { detail.tokenizer.tokens.1.token: is }
- - match: { detail.tokenizer.tokens.2.token: troubled }
- - match: { detail.tokenfilters.0.name: snowball }
- - match: { detail.tokenfilters.0.tokens.0.token: This }
- - match: { detail.tokenfilters.0.tokens.1.token: is }
- - match: { detail.tokenfilters.0.tokens.2.token: troubl }
- - match: { detail.tokenfilters.0.tokens.2.keyword: false }
+ - match: { detail.analyzer.name: standard }
+ - match: { detail.analyzer.tokens.0.token: this }
+ - match: { detail.analyzer.tokens.1.token: is }
+ - match: { detail.analyzer.tokens.2.token: troubled }
---
"Custom filter in request":
- do:
indices.analyze:
- body: { "text": "Foo Bar Buzz", "filter": ["lowercase", { "type": "stop", "stopwords": ["foo", "buzz"]}], "tokenizer": whitespace, "explain": true }
- - length: {detail.tokenizer.tokens: 3 }
- - length: {detail.tokenfilters.0.tokens: 3 }
- - length: {detail.tokenfilters.1.tokens: 1 }
- - match: { detail.tokenizer.name: whitespace }
- - match: { detail.tokenizer.tokens.0.token: Foo }
- - match: { detail.tokenizer.tokens.1.token: Bar }
- - match: { detail.tokenizer.tokens.2.token: Buzz }
- - match: { detail.tokenfilters.0.name: lowercase }
- - match: { detail.tokenfilters.0.tokens.0.token: foo }
- - match: { detail.tokenfilters.0.tokens.1.token: bar }
- - match: { detail.tokenfilters.0.tokens.2.token: buzz }
- - match: { detail.tokenfilters.1.name: "_anonymous_tokenfilter_[1]" }
- - match: { detail.tokenfilters.1.tokens.0.token: bar }
----
-"Custom char_filter in request":
- - do:
- indices.analyze:
- body: { "text": "jeff quit phish", "char_filter": [{"type": "mapping", "mappings": ["ph => f", "qu => q"]}], "tokenizer": keyword }
- - length: {tokens: 1 }
- - match: { tokens.0.token: "jeff qit fish" }
-
----
-"Custom tokenizer in request":
- - do:
- indices.analyze:
- body: { "text": "good", "tokenizer": {"type": "nGram", "min_gram": 2, "max_gram": 2}, "explain": true }
- - length: {detail.tokenizer.tokens: 3 }
- - match: { detail.tokenizer.name: _anonymous_tokenizer }
- - match: { detail.tokenizer.tokens.0.token: go }
- - match: { detail.tokenizer.tokens.1.token: oo }
- - match: { detail.tokenizer.tokens.2.token: od }
+ body:
+ text: foo bar buzz
+ tokenizer: standard
+ explain: true
+ filter:
+ - type: stop
+ stopwords: ["foo", "buzz"]
+ - length: { detail.tokenizer.tokens: 3 }
+ - length: { detail.tokenfilters.0.tokens: 1 }
+ - match: { detail.tokenizer.name: standard }
+ - match: { detail.tokenizer.tokens.0.token: foo }
+ - match: { detail.tokenizer.tokens.1.token: bar }
+ - match: { detail.tokenizer.tokens.2.token: buzz }
+ - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
+ - match: { detail.tokenfilters.0.tokens.0.token: bar }
diff --git a/settings.gradle b/settings.gradle
index 8e6d3d80a0..36f9c23e7c 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -26,14 +26,15 @@ List projects = [
'test:fixtures:hdfs-fixture',
'test:logger-usage',
'modules:aggs-matrix-stats',
+ 'modules:analysis-common',
'modules:ingest-common',
'modules:lang-expression',
'modules:lang-mustache',
'modules:lang-painless',
- 'modules:transport-netty4',
- 'modules:reindex',
'modules:percolator',
+ 'modules:reindex',
'modules:repository-url',
+ 'modules:transport-netty4',
'plugins:analysis-icu',
'plugins:analysis-kuromoji',
'plugins:analysis-phonetic',
diff --git a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java
index 83f955296b..7f60058788 100644
--- a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java
+++ b/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java
@@ -20,14 +20,12 @@
package org.elasticsearch;
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
-import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.common.collect.MapBuilder;
-import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
@@ -92,7 +90,6 @@ import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
@@ -110,7 +107,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
- * Alerts us if new analyzers are added to lucene, so we don't miss them.
+ * Alerts us if new analysis components are added to Lucene, so we don't miss them.
* <p>
* If we don't want to expose one for a specific reason, just map it to Void.
* The deprecated ones can be mapped to Deprecated.class.
@@ -178,7 +175,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
.put("apostrophe", ApostropheFilterFactory.class)
.put("arabicnormalization", ArabicNormalizationFilterFactory.class)
.put("arabicstem", ArabicStemTokenFilterFactory.class)
- .put("asciifolding", ASCIIFoldingTokenFilterFactory.class)
+ .put("asciifolding", MovedToAnalysisCommon.class)
.put("brazilianstem", BrazilianStemTokenFilterFactory.class)
.put("bulgarianstem", StemmerTokenFilterFactory.class)
.put("cjkbigram", CJKBigramFilterFactory.class)
@@ -253,8 +250,8 @@ public class AnalysisFactoryTestCase extends ESTestCase {
.put("turkishlowercase", LowerCaseTokenFilterFactory.class)
.put("type", KeepTypesFilterFactory.class)
.put("uppercase", UpperCaseTokenFilterFactory.class)
- .put("worddelimiter", WordDelimiterTokenFilterFactory.class)
- .put("worddelimitergraph", WordDelimiterGraphFilterFactory.class)
+ .put("worddelimiter", MovedToAnalysisCommon.class)
+ .put("worddelimitergraph", MovedToAnalysisCommon.class)
.put("flattengraph", FlattenGraphTokenFilterFactory.class)
// TODO: these tokenfilters are not yet exposed: useful?
@@ -401,6 +398,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
}
}
expected.remove(Void.class);
+ expected.remove(MovedToAnalysisCommon.class);
expected.remove(Deprecated.class);
Collection<Class<?>> actual = new HashSet<>();
@@ -489,4 +487,11 @@ public class AnalysisFactoryTestCase extends ESTestCase {
classesThatShouldNotHaveMultiTermSupport.isEmpty());
}
+ /**
+ * Marker class for components that have moved to the analysis-common module. This will be
+ * removed when the module is complete and these analysis components aren't available to core.
+ */
+ protected static final class MovedToAnalysisCommon {
+ private MovedToAnalysisCommon() {}
+ }
}
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java b/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java
index a60c21c1a7..d75a894d07 100644
--- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java
+++ b/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java
@@ -25,17 +25,18 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException;
import java.nio.file.Path;
-
-import static java.util.Collections.emptyList;
+import java.util.Arrays;
public class AnalysisTestsHelper {
- public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, String resource) throws IOException {
+ public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir,
+ String resource) throws IOException {
Settings settings = Settings.builder()
.loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
.put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
@@ -45,12 +46,15 @@ public class AnalysisTestsHelper {
}
public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(
- Settings settings) throws IOException {
+ Settings settings, AnalysisPlugin... plugins) throws IOException {
if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) {
- settings = Settings.builder().put(settings).put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
+ settings = Settings.builder().put(settings)
+ .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
}
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
- AnalysisRegistry analysisRegistry = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry();
+ AnalysisRegistry analysisRegistry =
+ new AnalysisModule(new Environment(settings), Arrays.asList(plugins))
+ .getAnalysisRegistry();
return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings),
analysisRegistry.buildTokenFilterFactories(indexSettings),
analysisRegistry.buildTokenizerFactories(indexSettings),