From a9a0f262af69a6ee4c09bcb286b42db0a78adf0a Mon Sep 17 00:00:00 2001
From: Jun Ohtani
Date: Wed, 16 Mar 2016 17:43:21 +0900
Subject: Analysis Kuromoji: Add nbest option and NumberFilter

Add nbest_cost and nbest_examples parameter to KuromojiTokenizerFactory
Add KuromojiNumberFilterFactory
---
 .../analysis/KuromojiNumberFilterFactory.java      | 37 +++++++++++++++
 .../index/analysis/KuromojiTokenizerFactory.java   | 14 +++++-
 .../analysis/kuromoji/AnalysisKuromojiPlugin.java  |  2 +
 .../index/analysis/KuromojiAnalysisTests.java      | 52 ++++++++++++++++++++++
 .../index/analysis/kuromoji_analysis.json          | 14 +++++-
 5 files changed, 117 insertions(+), 2 deletions(-)
 create mode 100644 plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java

(limited to 'plugins/analysis-kuromoji')

diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java
new file mode 100644
index 0000000000..cb6b478957
--- /dev/null
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiNumberFilterFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseNumberFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+
+public class KuromojiNumberFilterFactory extends AbstractTokenFilterFactory {
+
+    public KuromojiNumberFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+        super(indexSettings, name, settings);
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new JapaneseNumberFilter(tokenStream);
+    }
+}
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
index 87e08c757b..9e41621525 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiTokenizerFactory.java
@@ -36,9 +36,13 @@ import java.io.Reader;
 public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
     private static final String USER_DICT_OPTION = "user_dictionary";
+    private static final String NBEST_COST = "nbest_cost";
+    private static final String NBEST_EXAMPLES = "nbest_examples";
 
     private final UserDictionary userDictionary;
     private final Mode mode;
+    private final String nBestExamples;
+    private final int nBestCost;
 
     private boolean discartPunctuation;
 
@@ -47,6 +51,8 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
         mode = getMode(settings);
         userDictionary = getUserDictionary(env, settings);
         discartPunctuation = settings.getAsBoolean("discard_punctuation", true);
+        nBestCost = settings.getAsInt(NBEST_COST, -1);
+        nBestExamples = settings.get(NBEST_EXAMPLES);
     }
 
     public static UserDictionary getUserDictionary(Environment env, Settings settings) {
@@ -83,7 +89,13 @@ public class KuromojiTokenizerFactory extends AbstractTokenizerFactory {
 
     @Override
     public Tokenizer create() {
-        return new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
+        JapaneseTokenizer t = new JapaneseTokenizer(userDictionary, discartPunctuation, mode);
+        int nBestCost = this.nBestCost;
+        if (nBestExamples != null) {
+            nBestCost = Math.max(nBestCost, t.calcNBestCost(nBestExamples));
+        }
+        t.setNBestCost(nBestCost);
+        return t;
     }
 
 }
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
index 6c0a15f2e3..4208b1be50 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@@ -24,6 +24,7 @@ import org.elasticsearch.index.analysis.KuromojiAnalyzerProvider;
 import org.elasticsearch.index.analysis.KuromojiBaseFormFilterFactory;
 import org.elasticsearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
 import org.elasticsearch.index.analysis.KuromojiKatakanaStemmerFactory;
+import org.elasticsearch.index.analysis.KuromojiNumberFilterFactory;
 import org.elasticsearch.index.analysis.KuromojiPartOfSpeechFilterFactory;
 import org.elasticsearch.index.analysis.KuromojiReadingFormFilterFactory;
 import org.elasticsearch.index.analysis.KuromojiTokenizerFactory;
@@ -55,5 +56,6 @@ public class AnalysisKuromojiPlugin extends Plugin {
         module.registerTokenFilter("kuromoji_readingform", KuromojiReadingFormFilterFactory::new);
         module.registerTokenFilter("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
         module.registerTokenFilter("ja_stop", JapaneseStopTokenFilterFactory::new);
+        module.registerTokenFilter("kuromoji_number", KuromojiNumberFilterFactory::new);
     }
 }
diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
index b81de20d73..04d8d64cc7 100644
--- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
+++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/KuromojiAnalysisTests.java
@@ -24,7 +24,11 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.elasticsearch.Version;
+import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.ModulesBuilder;
@@ -75,6 +79,9 @@ public class KuromojiAnalysisTests extends ESTestCase {
         filterFactory = analysisService.tokenFilter("ja_stop");
         assertThat(filterFactory, instanceOf(JapaneseStopTokenFilterFactory.class));
 
+        filterFactory = analysisService.tokenFilter("kuromoji_number");
+        assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
+
         NamedAnalyzer analyzer = analysisService.analyzer("kuromoji");
         assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
 
@@ -262,4 +269,49 @@ public class KuromojiAnalysisTests extends ESTestCase {
         TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_empty_user_dict");
         assertThat(tokenizerFactory, instanceOf(KuromojiTokenizerFactory.class));
     }
+
+    public void testNbestCost() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_cost");
+        String source = "鳩山積み";
+        String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
+
+        Tokenizer tokenizer = tokenizerFactory.create();
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenizer, expected);
+    }
+
+    public void testNbestExample() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_examples");
+        String source = "鳩山積み";
+        String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
+
+        Tokenizer tokenizer = tokenizerFactory.create();
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenizer, expected);
+    }
+
+    public void testNbestBothOptions() throws IOException {
+        AnalysisService analysisService = createAnalysisService();
+        TokenizerFactory tokenizerFactory = analysisService.tokenizer("kuromoji_nbest_both");
+        String source = "鳩山積み";
+        String[] expected = new String[] {"鳩", "鳩山", "山積み", "積み"};
+
+        Tokenizer tokenizer = tokenizerFactory.create();
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenizer, expected);
+
+    }
+
+    public void testNumberFilterFactory() throws Exception {
+        AnalysisService analysisService = createAnalysisService();
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("kuromoji_number");
+        assertThat(tokenFilter, instanceOf(KuromojiNumberFilterFactory.class));
+        String source = "本日十万二千五百円のワインを買った";
+        String[] expected = new String[]{"本日", "102500", "円", "の", "ワイン", "を", "買っ", "た"};
+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
 }
diff --git a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
index 58ed015b85..d0f94a2117 100644
--- a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
+++ b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/index/analysis/kuromoji_analysis.json
@@ -18,7 +18,6 @@
                     "type": "ja_stop",
                     "stopwords": ["_japanese_", "スピード"]
                 }
-
             },
 
             "char_filter":{
@@ -48,6 +47,19 @@
                 "kuromoji_user_dict" : {
                     "type":"kuromoji_tokenizer",
                     "user_dictionary":"user_dict.txt"
+                },
+                "kuromoji_nbest_cost" : {
+                    "type": "kuromoji_tokenizer",
+                    "nbest_cost" : "2000"
+                },
+                "kuromoji_nbest_examples" : {
+                    "type": "kuromoji_tokenizer",
+                    "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/"
+                },
+                "kuromoji_nbest_both" : {
+                    "type": "kuromoji_tokenizer",
+                    "nbest_examples" : "/鳩山積み-鳩山/鳩山積み-鳩/",
+                    "nbest_cost" : "1000"
                 }
             },
             "analyzer" : {
-- 
cgit v1.2.3