summary | refs | log | tree | commit | diff
path: root/plugins/analysis-kuromoji
diff options
context:
space:
mode:
author: Jun Ohtani <johtani@gmail.com> 2016-03-16 17:43:21 +0900
committer: Jun Ohtani <johtani@gmail.com> 2016-03-22 20:09:56 +0900
commit: a9a0f262af69a6ee4c09bcb286b42db0a78adf0a (patch)
tree: 6eed7c3b0556e9e1cad2de957cafa530788f400b /plugins/analysis-kuromoji
parent: b07a8185a7a3e9e5ea34e5a2aa11584f8d37b042 (diff)
Analysis Kuromoji: Add nbest option and NumberFilter
Add nbest_cost and nbest_examples parameter to KuromojiTokenizerFactory. Add KuromojiNumberFilterFactory.
Diffstat (limited to 'plugins/analysis-kuromoji')
-rw-r--r-- plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java | 37
-rw-r--r-- plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java | 14
-rw-r--r-- plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java | 2
-rw-r--r-- plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java | 52
-rw-r--r-- plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json | 14
5 files changed, 117 insertions, 2 deletions
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java
new file mode 100644
index 0000000000..cb6b478957
--- /dev/null
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseNumberFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory {
+
+ public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+ super(indexSettings, name, settings);
+ }
+
+ @Override
+ public TokenStream create(TokenStream tokenStream) {
+ return new JapaneseNumberFilter(tokenStream);
+ }
+}
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
index 87e08c757b..9e41621525 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
@@ -36,9 +36,13 @@ import java.io.Reader;
public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
private static final String USER_DICT_OPTION = "user_dictionary";
+ private static final String NBEST_COST = "nbest_cost";
+ private static final String NBEST_EXAMPLES = "nbest_examples";
private final UserDictionary userDictionary;
private final Mode mode;
+ private final String nBestExamples;
+ private final int nBestCost;
private boolean discartPunctuation;
@@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
mode = getMode(settings);
userDictionary = getUserDictionary(env, settings);
discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
+ nBestCost = settings.getAsInt(NBEST_COST, -1);
+ nBestExamples = settings.get(NBEST_EXAMPLES);
}
public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
@Override
public Tokenizer create() {
- return new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
+ JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
+ int nBestCost = this.nBestCost;
+ if (nBestExamples != null) {
+ nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
+ }
+ t.setNBestCost(nBestCost);
+ return t;
}
}
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
index 6c0a15f2e3..4208b1be50 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory;
import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory;
+import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory;
import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory;
import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory;
import org.elasticsearch.index.analysis.KuromojiTokenizerFactory;
@@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin {
module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new);
module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new);
+ module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new);
}
}
diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
index b81de20d73..04d8d64cc7 100644
--- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.Version;
+import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
@@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase {
filterFactory = analysisService.tokenFilter("ja_stop");
assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
+ filterFactory = analysisService.tokenFilter("kuromoji_number");
+ assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
+
NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
@@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase {
TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict");
assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));
}
+
+ public void testNbestCost() throws IOException {
+ AnalysisService analysisService = createAnalysisService();
+ TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost");
+ String source = "鳩山積み";
+ String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
+
+ Tokenizer tokenizer = tokenizerFactory.create();
+ tokenizer.setReader(new StringReader(source));
+ assertSimpleTSOutput(tokenizer, expected);
+ }
+
+ public void testNbestExample() throws IOException {
+ AnalysisService analysisService = createAnalysisService();
+ TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples");
+ String source = "鳩山積み";
+ String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
+
+ Tokenizer tokenizer = tokenizerFactory.create();
+ tokenizer.setReader(new StringReader(source));
+ assertSimpleTSOutput(tokenizer, expected);
+ }
+
+ public void testNbestBothOptions() throws IOException {
+ AnalysisService analysisService = createAnalysisService();
+ TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both");
+ String source = "鳩山積み";
+ String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
+
+ Tokenizer tokenizer = tokenizerFactory.create();
+ tokenizer.setReader(new StringReader(source));
+ assertSimpleTSOutput(tokenizer, expected);
+
+ }
+
+ public void testNumberFilterFactory() throws Exception {
+ AnalysisService analysisService = createAnalysisService();
+ TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number");
+ assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));
+ String source = "本日十万二千五百円のワインを買った";
+ String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
+ Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+ tokenizer.setReader(new StringReader(source));
+ assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+ }
}
diff --git a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
index 58ed015b85..d0f94a2117 100644
--- a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
+++ b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
@@ -18,7 +18,6 @@
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
}
-
},
"char_filter":{
@@ -48,6 +47,19 @@
"kuromoji_user_dict" : {
"type":"kuromoji_tokenizer",
"user_dictionary":"user_dict.txt"
+ },
+ "kuromoji_nbest_cost" : {
+ "type": "kuromoji_tokenizer",
+ "nbest_cost" : "2000"
+ },
+ "kuromoji_nbest_examples" : {
+ "type": "kuromoji_tokenizer",
+ "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/"
+ },
+ "kuromoji_nbest_both" : {
+ "type": "kuromoji_tokenizer",
+ "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
+ "nbest_cost" : "1000"
}
},
"analyzer" : {