diff options
author | Martijn van Groningen <martijn.v.groningen@gmail.com> | 2017-06-14 01:26:36 +0200 |
---|---|---|
committer | Martijn van Groningen <martijn.v.groningen@gmail.com> | 2017-06-15 18:28:31 +0200 |
commit | 428e70758ac6895ac995f4315412f4d3729aea9b (patch) | |
tree | bb6404aac053c5ece590214a33e02304c2bab694 /core/src/test/java/org/elasticsearch/index | |
parent | 2a78b0a19fb6584944d92ad34a91f2814b3dcbe4 (diff) |
Moved more token filters to analysis-common module.
The following token filters were moved: `edge_ngram`, `ngram`, `uppercase`, `lowercase`, `length`, `flatten_graph` and `unique`.
Relates to #23658
Diffstat (limited to 'core/src/test/java/org/elasticsearch/index')
2 files changed, 0 insertions, 225 deletions
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java deleted file mode 100644 index 259da010da..0000000000 --- a/core/src/test/java/org/elasticsearch/index/analysis/FlattenGraphTokenFilterFactoryTests.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import java.io.IOException; - -import org.apache.lucene.analysis.CannedTokenStream; -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.test.ESTokenStreamTestCase; -import org.elasticsearch.test.IndexSettingsModule; - -public class FlattenGraphTokenFilterFactoryTests extends ESTokenStreamTestCase { - - public void testBasic() throws IOException { - - Index index = new Index("test", "_na_"); - String name = "ngr"; - Settings indexSettings = newAnalysisSettingsBuilder().build(); - IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings); - Settings settings = newAnalysisSettingsBuilder().build(); - - // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input: - TokenStream in = new CannedTokenStream(0, 12, new Token[] { - token("wtf", 1, 5, 0, 3), - token("what", 0, 1, 0, 3), - token("wow", 0, 3, 0, 3), - token("the", 1, 1, 0, 3), - token("fudge", 1, 3, 0, 3), - token("that's", 1, 1, 0, 3), - token("funny", 1, 1, 0, 3), - token("happened", 1, 1, 4, 12) - }); - - TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in); - - // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened: - assertTokenStreamContents(tokens, - new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"}, - new int[] {0, 0, 0, 0, 0, 0, 0, 4}, - new int[] {3, 3, 3, 3, 3, 3, 3, 12}, - new int[] {1, 0, 0, 1, 0, 1, 0, 1}, - new int[] {3, 1, 1, 1, 1, 1, 1, 1}, - 12); - } - - private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) { - final Token t = new Token(term, startOffset, endOffset); - t.setPositionIncrement(posInc); - t.setPositionLength(posLength); - return t; - } -} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/NGramTokenizerFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/NGramTokenizerFactoryTests.java deleted file mode 100644 index 5e1cf2e817..0000000000 --- a/core/src/test/java/org/elasticsearch/index/analysis/NGramTokenizerFactoryTests.java +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.index.analysis; - -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; -import org.elasticsearch.Version; -import org.elasticsearch.cluster.metadata.IndexMetaData; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.settings.Settings.Builder; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.test.ESTokenStreamTestCase; -import org.elasticsearch.test.IndexSettingsModule; - -import java.io.IOException; -import java.io.StringReader; -import java.lang.reflect.Field; -import java.lang.reflect.Modifier; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import static com.carrotsearch.randomizedtesting.RandomizedTest.scaledRandomIntBetween; -import static org.hamcrest.Matchers.instanceOf; - -public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { - public void testParseTokenChars() { - final Index index = new Index("test", "_na_"); - final String name = "ngr"; - final Settings indexSettings = newAnalysisSettingsBuilder().build(); - IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings); - for (String tokenChars : Arrays.asList("letters", "number", "DIRECTIONALITY_UNDEFINED")) { - final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build(); - try { - new NGramTokenizerFactory(indexProperties, null, name, settings).create(); - fail(); - } catch (IllegalArgumentException expected) { - // OK - } - } - for (String tokenChars : Arrays.asList("letter", " digit ", "punctuation", "DIGIT", "CoNtRoL", "dash_punctuation")) { - final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", tokenChars).build(); - indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings); - - new NGramTokenizerFactory(indexProperties, null, name, settings).create(); - // no exception - } - } - - public void testNoTokenChars() throws IOException { - final Index index = new Index("test", "_na_"); - final String name = "ngr"; - final Settings indexSettings = newAnalysisSettingsBuilder().build(); - final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4).putArray("token_chars", new String[0]).build(); - Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); - tokenizer.setReader(new StringReader("1.34")); - assertTokenStreamContents(tokenizer, new String[] {"1.", "1.3", "1.34", ".3", ".34", "34"}); - } - - public void testPreTokenization() throws IOException { - // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters - final Index index = new Index("test", "_na_"); - final String name = "ngr"; - final Settings indexSettings = newAnalysisSettingsBuilder().build(); - Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build(); - Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); - tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f ")); - assertTokenStreamContents(tokenizer, - new String[] {"Åb", "Åbc", "bc", "dé", "déf", "éf", "g\uD801\uDC00", "g\uD801\uDC00f", "\uD801\uDC00f"}); - settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build(); - tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); - tokenizer.setReader(new StringReader(" a!$ 9")); - assertTokenStreamContents(tokenizer, - new String[] {" a", " a!", "a!", "a!$", "!$", "!$ ", "$ ", "$ 9", " 9"}); - } - - public void testPreTokenizationEdge() throws IOException { - // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters - final Index index = new Index("test", "_na_"); - final String name = "ngr"; - final Settings indexSettings = newAnalysisSettingsBuilder().build(); - Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build(); - Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); - tokenizer.setReader(new StringReader("Åbc déf g\uD801\uDC00f ")); - assertTokenStreamContents(tokenizer, - new String[] {"Åb", "Åbc", "dé", "déf", "g\uD801\uDC00", "g\uD801\uDC00f"}); - settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build(); - tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(); - tokenizer.setReader(new StringReader(" a!$ 9")); - assertTokenStreamContents(tokenizer, - new String[] {" a", " a!"}); - } - - public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception { - int iters = scaledRandomIntBetween(20, 100); - for (int i = 0; i < iters; i++) { - final Index index = new Index("test", "_na_"); - final String name = "ngr"; - Version v = randomVersion(random()); - Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3); - boolean reverse = random().nextBoolean(); - if (reverse) { - builder.put("side", "back"); - } - Settings settings = builder.build(); - Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build(); - Tokenizer tokenizer = new MockTokenizer(); - tokenizer.setReader(new StringReader("foo bar")); - TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create(tokenizer); - if (reverse) { - assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class)); - } else { - assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class)); - } - } - } - - - private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException { - Field[] declaredFields = Version.class.getFields(); - List<Field> versionFields = new ArrayList<>(); - for (Field field : declaredFields) { - if ((field.getModifiers() & Modifier.STATIC) != 0 && field.getName().startsWith("V_") && field.getType() == Version.class) { - versionFields.add(field); - } - } - return (Version) versionFields.get(random.nextInt(versionFields.size())).get(Version.class); - } - -} |