summary | refs | log | tree | commit | diff
path: root/core/src/test/java
diff options
context:
space:
mode:
author: markharwood <markharwood@gmail.com> 2017-06-23 15:34:38 +0100
committer: GitHub <noreply@github.com> 2017-06-23 15:34:38 +0100
commit: 973530f953b193c797047286be3e07ca7dce17e8 (patch)
tree: 33be40ad09a685febad9574f38e5303a934c1f18 /core/src/test/java
parent: 9ff1698aa7f408ea8860877109fbad745622c717 (diff)
Added unit test coverage for SignificantTerms (#24904)
Added unit test coverage for GlobalOrdinalsSignificantTermsAggregator, GlobalOrdinalsSignificantTermsAggregator.WithHash, SignificantLongTermsAggregator and SignificantStringTermsAggregator. Removed integration test. Relates #22278
Diffstat (limited to 'core/src/test/java')
-rw-r--r-- core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java 473
-rw-r--r-- core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java 2
-rw-r--r-- core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java 214
3 files changed, 215 insertions, 474 deletions
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java
deleted file mode 100644
index bff7471e86..0000000000
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsIT.java
+++ /dev/null
@@ -1,473 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.elasticsearch.search.aggregations.bucket;
-
-import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
-import org.elasticsearch.action.search.SearchPhaseExecutionException;
-import org.elasticsearch.action.search.SearchResponse;
-import org.elasticsearch.action.search.SearchType;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.index.query.QueryBuilders;
-import org.elasticsearch.index.query.TermQueryBuilder;
-import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms;
-import org.elasticsearch.search.aggregations.bucket.significant.SignificantTerms.Bucket;
-import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.ChiSquare;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.GND;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.JLHScore;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.MutualInformation;
-import org.elasticsearch.search.aggregations.bucket.significant.heuristics.PercentageScore;
-import org.elasticsearch.search.aggregations.bucket.terms.Terms;
-import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
-import org.elasticsearch.test.ESIntegTestCase;
-
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-
-import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
-import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
-import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
-import static org.elasticsearch.index.query.QueryBuilders.termQuery;
-import static org.elasticsearch.search.aggregations.AggregationBuilders.significantTerms;
-import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
-import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.hasSize;
-import static org.hamcrest.Matchers.is;
-import static org.hamcrest.core.IsNull.notNullValue;
-
-@ESIntegTestCase.SuiteScopeTestCase
-public class SignificantTermsIT extends ESIntegTestCase {
-
- public String randomExecutionHint() {
- return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString();
- }
-
- @Override
- public Settings indexSettings() {
- return Settings.builder()
- .put("index.number_of_shards", numberOfShards())
- .put("index.number_of_replicas", numberOfReplicas())
- .build();
- }
-
- public static final int MUSIC_CATEGORY=1;
- public static final int OTHER_CATEGORY=2;
- public static final int SNOWBOARDING_CATEGORY=3;
-
- @Override
- public void setupSuiteScopeCluster() throws Exception {
- assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, 5, SETTING_NUMBER_OF_REPLICAS, 0).addMapping("fact",
- "_routing", "required=true", "routing_id", "type=keyword", "fact_category",
- "type=integer", "description", "type=text,fielddata=true"));
- createIndex("idx_unmapped");
-
- ensureGreen();
- String data[] = {
- "A\t1\tpaul weller was lead singer of the jam before the style council",
- "B\t1\tpaul weller left the jam to form the style council",
- "A\t2\tpaul smith is a designer in the fashion industry",
- "B\t1\tthe stranglers are a group originally from guildford",
- "A\t1\tafter disbanding the style council in 1985 paul weller became a solo artist",
- "B\t1\tjean jaques burnel is a bass player in the stranglers and has a black belt in karate",
- "A\t1\tmalcolm owen was the lead singer of the ruts",
- "B\t1\tpaul weller has denied any possibility of a reunion of the jam",
- "A\t1\tformer frontman of the jam paul weller became the father of twins",
- "B\t2\tex-england football star paul gascoigne has re-emerged following recent disappearance",
- "A\t2\tdavid smith has recently denied connections with the mafia",
- "B\t1\tthe damned's new rose single was considered the first 'punk' single in the UK",
- "A\t1\tthe sex pistols broke up after a few short years together",
- "B\t1\tpaul gascoigne was a midfielder for england football team",
- "A\t3\tcraig kelly became the first world champion snowboarder and has a memorial at baldface lodge",
- "B\t3\tterje haakonsen has credited craig kelly as his snowboard mentor",
- "A\t3\tterje haakonsen and craig kelly were some of the first snowboarders sponsored by burton snowboards",
- "B\t3\tlike craig kelly before him terje won the mt baker banked slalom many times - once riding switch",
- "A\t3\tterje haakonsen has been a team rider for burton snowboards for over 20 years"
- };
-
- for (int i = 0; i < data.length; i++) {
- String[] parts = data[i].split("\t");
- client().prepareIndex("test", "fact", "" + i)
- .setRouting(parts[0])
- .setSource("fact_category", parts[1], "description", parts[2]).get();
- }
- client().admin().indices().refresh(new RefreshRequest("test")).get();
-
- assertAcked(prepareCreate("test_not_indexed")
- .setSettings(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
- .addMapping("type",
- "my_keyword", "type=keyword,index=false",
- "my_long", "type=long,index=false"));
- indexRandom(true,
- client().prepareIndex("test_not_indexed", "type", "1").setSource(
- "my_keyword", "foo", "my_long", 42));
- }
-
- public void testStructuredAnalysis() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint())
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
- assertTrue(topCategory.equals(Long.valueOf(SNOWBOARDING_CATEGORY)));
- }
-
- public void testStructuredAnalysisWithIncludeExclude() throws Exception {
- long[] excludeTerms = { MUSIC_CATEGORY };
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "paul"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint())
- .minDocCount(1).includeExclude(new IncludeExclude(null, excludeTerms)))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- Number topCategory = (Number) topTerms.getBuckets().iterator().next().getKey();
- assertTrue(topCategory.equals(Long.valueOf(OTHER_CATEGORY)));
- }
-
- public void testIncludeExclude() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setQuery(new TermQueryBuilder("description", "weller"))
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
- .includeExclude(new IncludeExclude(null, "weller")))
- .get();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- Set<String> terms = new HashSet<>();
- for (Bucket topTerm : topTerms) {
- terms.add(topTerm.getKeyAsString());
- }
- assertThat(terms, hasSize(6));
- assertThat(terms.contains("jam"), is(true));
- assertThat(terms.contains("council"), is(true));
- assertThat(terms.contains("style"), is(true));
- assertThat(terms.contains("paul"), is(true));
- assertThat(terms.contains("of"), is(true));
- assertThat(terms.contains("the"), is(true));
-
- response = client().prepareSearch("test")
- .setQuery(new TermQueryBuilder("description", "weller"))
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
- .includeExclude(new IncludeExclude("weller", null)))
- .get();
- assertSearchResponse(response);
- topTerms = response.getAggregations().get("mySignificantTerms");
- terms = new HashSet<>();
- for (Bucket topTerm : topTerms) {
- terms.add(topTerm.getKeyAsString());
- }
- assertThat(terms, hasSize(1));
- assertThat(terms.contains("weller"), is(true));
- }
-
- public void testIncludeExcludeExactValues() throws Exception {
- String []incExcTerms={"weller","nosuchterm"};
- SearchResponse response = client().prepareSearch("test")
- .setQuery(new TermQueryBuilder("description", "weller"))
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
- .includeExclude(new IncludeExclude(null, incExcTerms)))
- .get();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- Set<String> terms = new HashSet<>();
- for (Bucket topTerm : topTerms) {
- terms.add(topTerm.getKeyAsString());
- }
- assertEquals(new HashSet<String>(Arrays.asList("jam", "council", "style", "paul", "of", "the")), terms);
-
- response = client().prepareSearch("test")
- .setQuery(new TermQueryBuilder("description", "weller"))
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
- .includeExclude(new IncludeExclude(incExcTerms, null)))
- .get();
- assertSearchResponse(response);
- topTerms = response.getAggregations().get("mySignificantTerms");
- terms = new HashSet<>();
- for (Bucket topTerm : topTerms) {
- terms.add(topTerm.getKeyAsString());
- }
- assertThat(terms, hasSize(1));
- assertThat(terms.contains("weller"), is(true));
- }
-
- public void testUnmapped() throws Exception {
- SearchResponse response = client().prepareSearch("idx_unmapped")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("fact_category").executionHint(randomExecutionHint())
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- assertThat(topTerms.getBuckets().size(), equalTo(0));
- }
-
- public void testTextAnalysis() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testTextAnalysisGND() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new GND(true))
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testTextAnalysisChiSquare() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint()).significanceHeuristic(new ChiSquare(false,true))
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testTextAnalysisPercentageScore() throws Exception {
- SearchResponse response = client()
- .prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0)
- .setSize(60)
- .setExplain(true)
- .addAggregation(
- significantTerms("mySignificantTerms").field("description").executionHint(randomExecutionHint())
- .significanceHeuristic(new PercentageScore()).minDocCount(2)).execute().actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testBadFilteredAnalysis() throws Exception {
- // Deliberately using a bad choice of filter here for the background context in order
- // to test robustness.
- // We search for the name of a snowboarder but use music-related content (fact_category:1)
- // as the background source of term statistics.
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("description")
- .minDocCount(2).backgroundFilter(QueryBuilders.termQuery("fact_category", 1)))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- // We expect at least one of the significant terms to have been selected on the basis
- // that it is present in the foreground selection but entirely missing from the filtered
- // background used as context.
- boolean hasMissingBackgroundTerms = false;
- for (Bucket topTerm : topTerms) {
- if (topTerm.getSupersetDf() == 0) {
- hasMissingBackgroundTerms = true;
- break;
- }
- }
- assertTrue(hasMissingBackgroundTerms);
- }
-
- public void testFilteredAnalysis() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "weller"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("description")
- .minDocCount(1).backgroundFilter(QueryBuilders.termsQuery("description", "paul")))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- HashSet<String> topWords = new HashSet<String>();
- for (Bucket topTerm : topTerms) {
- topWords.add(topTerm.getKeyAsString());
- }
- //The word "paul" should be a constant of all docs in the background set and therefore not seen as significant
- assertFalse(topWords.contains("paul"));
- //"Weller" is the only Paul who was in The Jam and therefore this should be identified as a differentiator from the background of all other Pauls.
- assertTrue(topWords.contains("jam"));
- }
-
- public void testNestedAggs() throws Exception {
- String[][] expectedKeywordsByCategory={
- { "paul", "weller", "jam", "style", "council" },
- { "paul", "smith" },
- { "craig", "kelly", "terje", "haakonsen", "burton" }};
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .addAggregation(terms("myCategories").field("fact_category").minDocCount(2)
- .subAggregation(
- significantTerms("mySignificantTerms").field("description")
- .executionHint(randomExecutionHint())
- .minDocCount(2)))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- Terms topCategoryTerms = response.getAggregations().get("myCategories");
- for (org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket topCategory : topCategoryTerms.getBuckets()) {
- SignificantTerms topTerms = topCategory.getAggregations().get("mySignificantTerms");
- HashSet<String> foundTopWords = new HashSet<String>();
- for (Bucket topTerm : topTerms) {
- foundTopWords.add(topTerm.getKeyAsString());
- }
- String[] expectedKeywords = expectedKeywordsByCategory[Integer.parseInt(topCategory.getKeyAsString()) - 1];
- for (String expectedKeyword : expectedKeywords) {
- assertTrue(expectedKeyword + " missing from category keywords", foundTopWords.contains(expectedKeyword));
- }
- }
- }
-
- public void testPartiallyUnmapped() throws Exception {
- SearchResponse response = client().prepareSearch("idx_unmapped", "test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms").field("description")
- .executionHint(randomExecutionHint())
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testPartiallyUnmappedWithFormat() throws Exception {
- SearchResponse response = client().prepareSearch("idx_unmapped", "test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(boolQuery().should(termQuery("description", "the")).should(termQuery("description", "terje")))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms")
- .field("fact_category")
- .executionHint(randomExecutionHint())
- .minDocCount(1)
- .format("0000"))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- for (int i = 1; i <= 3; i++) {
- String key = String.format(Locale.ROOT, "%04d", i);
- SignificantTerms.Bucket bucket = topTerms.getBucketByKey(key);
- assertThat(bucket, notNullValue());
- assertThat(bucket.getKeyAsString(), equalTo(key));
- }
- }
-
- private void checkExpectedStringTermsFound(SignificantTerms topTerms) {
- HashMap<String,Bucket>topWords=new HashMap<>();
- for (Bucket topTerm : topTerms ){
- topWords.put(topTerm.getKeyAsString(), topTerm);
- }
- assertTrue( topWords.containsKey("haakonsen"));
- assertTrue( topWords.containsKey("craig"));
- assertTrue( topWords.containsKey("kelly"));
- assertTrue( topWords.containsKey("burton"));
- assertTrue( topWords.containsKey("snowboards"));
- Bucket kellyTerm=topWords.get("kelly");
- assertEquals(3, kellyTerm.getSubsetDf());
- assertEquals(4, kellyTerm.getSupersetDf());
- }
-
- public void testDefaultSignificanceHeuristic() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms")
- .field("description")
- .executionHint(randomExecutionHint())
- .significanceHeuristic(new JLHScore())
- .minDocCount(2))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testMutualInformation() throws Exception {
- SearchResponse response = client().prepareSearch("test")
- .setSearchType(SearchType.QUERY_THEN_FETCH)
- .setQuery(new TermQueryBuilder("description", "terje"))
- .setFrom(0).setSize(60).setExplain(true)
- .addAggregation(significantTerms("mySignificantTerms")
- .field("description")
- .executionHint(randomExecutionHint())
- .significanceHeuristic(new MutualInformation(false, true))
- .minDocCount(1))
- .execute()
- .actionGet();
- assertSearchResponse(response);
- SignificantTerms topTerms = response.getAggregations().get("mySignificantTerms");
- checkExpectedStringTermsFound(topTerms);
- }
-
- public void testFailIfFieldNotIndexed() {
- SearchPhaseExecutionException e = expectThrows(SearchPhaseExecutionException.class,
- () -> client().prepareSearch("test_not_indexed").addAggregation(
- significantTerms("mySignificantTerms").field("my_keyword")).get());
- assertThat(e.toString(),
- containsString("Cannot search on field [my_keyword] since it is not indexed."));
-
- e = expectThrows(SearchPhaseExecutionException.class,
- () -> client().prepareSearch("test_not_indexed").addAggregation(
- significantTerms("mySignificantTerms").field("my_long")).get());
- assertThat(e.toString(),
- containsString("Cannot search on field [my_long] since it is not indexed."));
- }
-}
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java
index 2dc208d89f..9c6615f8ff 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificanceHeuristicTests.java
@@ -135,7 +135,7 @@ public class SignificanceHeuristicTests extends ESTestCase {
}
}
- SignificanceHeuristic getRandomSignificanceheuristic() {
+ public static SignificanceHeuristic getRandomSignificanceheuristic() {
List<SignificanceHeuristic> heuristics = new ArrayList<>();
heuristics.add(new JLHScore());
heuristics.add(new MutualInformation(randomBoolean(), randomBoolean()));
diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java
index e2625039df..537af74bda 100644
--- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorTests.java
@@ -19,23 +19,43 @@
package org.elasticsearch.search.aggregations.bucket.significant;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.NumberFieldMapper;
+import org.elasticsearch.index.mapper.NumberFieldMapper.NumberFieldType;
+import org.elasticsearch.index.mapper.NumberFieldMapper.NumberType;
+import org.elasticsearch.index.mapper.TextFieldMapper.TextFieldType;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.AggregatorTestCase;
+import org.elasticsearch.search.aggregations.bucket.significant.SignificantTermsAggregatorFactory.ExecutionMode;
+import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.support.ValueType;
import org.hamcrest.Matchers;
import org.junit.Before;
import java.io.IOException;
+import java.util.List;
public class SignificantTermsAggregatorTests extends AggregatorTestCase {
@@ -71,5 +91,199 @@ public class SignificantTermsAggregatorTests extends AggregatorTestCase {
// be 0
assertEquals(1, ((BooleanQuery) parsedQuery).getMinimumNumberShouldMatch());
}
+
+ /**
+  * Uses the significant terms aggregation to find the keywords in text fields.
+  * <p>
+  * The index is filled by {@code addMixedTextDocs}: every document contains
+  * "common", half of them also contain "odd" and the other half "even". Searching
+  * for one of the two distinguishing words must report only that word as
+  * significant, both without and with include/exclude clauses.
+  *
+  * @throws IOException if writing to or reading from the test index fails
+  */
+ public void testSignificance() throws IOException {
+     TextFieldType textFieldType = new TextFieldType();
+     textFieldType.setName("text");
+     textFieldType.setFielddata(true);
+     textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
+
+     IndexWriterConfig indexWriterConfig = newIndexWriterConfig();
+     try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) {
+         addMixedTextDocs(textFieldType, w);
+
+         SignificantTermsAggregationBuilder sigAgg = new SignificantTermsAggregationBuilder("sig_text", null).field("text");
+         sigAgg.executionHint(randomExecutionHint());
+         if (randomBoolean()) {
+             // Use a background filter which just happens to be same scope as whole-index.
+             sigAgg.backgroundFilter(QueryBuilders.termsQuery("text", "common"));
+         }
+
+         try (IndexReader reader = DirectoryReader.open(w)) {
+             IndexSearcher searcher = new IndexSearcher(reader);
+
+             // Search "odd"
+             SignificantTerms terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
+
+             assertEquals(1, terms.getBuckets().size());
+             assertNull(terms.getBucketByKey("even"));
+             assertNull(terms.getBucketByKey("common"));
+             assertNotNull(terms.getBucketByKey("odd"));
+
+             // Search "even"
+             terms = searchAndReduce(searcher, new TermQuery(new Term("text", "even")), sigAgg, textFieldType);
+
+             assertEquals(1, terms.getBuckets().size());
+             assertNull(terms.getBucketByKey("odd"));
+             assertNull(terms.getBucketByKey("common"));
+             assertNotNull(terms.getBucketByKey("even"));
+
+             // Search "odd" with a regex include clause
+             sigAgg.includeExclude(new IncludeExclude("o.d", null));
+             terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
+             assertEquals(1, terms.getBuckets().size());
+             assertNotNull(terms.getBucketByKey("odd"));
+             assertNull(terms.getBucketByKey("common"));
+             assertNull(terms.getBucketByKey("even"));
+
+             // Search with exact-value include/exclude lists; "weird" and "regular"
+             // never occur in the indexed documents.
+             String[] oddStrings = new String[] {"odd", "weird"};
+             String[] evenStrings = new String[] {"even", "regular"};
+
+             // Include the odd words, exclude the even ones: "odd" is still reported.
+             sigAgg.includeExclude(new IncludeExclude(oddStrings, evenStrings));
+             sigAgg.significanceHeuristic(SignificanceHeuristicTests.getRandomSignificanceheuristic());
+             terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
+             assertEquals(1, terms.getBuckets().size());
+             assertNotNull(terms.getBucketByKey("odd"));
+             assertNull(terms.getBucketByKey("weird"));
+             assertNull(terms.getBucketByKey("common"));
+             assertNull(terms.getBucketByKey("even"));
+             assertNull(terms.getBucketByKey("regular"));
+
+             // Swap the lists: with "odd" excluded, nothing is reported.
+             sigAgg.includeExclude(new IncludeExclude(evenStrings, oddStrings));
+             terms = searchAndReduce(searcher, new TermQuery(new Term("text", "odd")), sigAgg, textFieldType);
+             assertEquals(0, terms.getBuckets().size());
+             assertNull(terms.getBucketByKey("odd"));
+             assertNull(terms.getBucketByKey("weird"));
+             assertNull(terms.getBucketByKey("common"));
+             assertNull(terms.getBucketByKey("even"));
+             assertNull(terms.getBucketByKey("regular"));
+
+         }
+     }
+ }
+
+ /**
+  * Checks that the significant terms aggregation picks out the distinguishing
+  * value of a numeric field.
+  * <p>
+  * Each of the ten documents carries one shared long value plus one of two other
+  * long values, selected by the parity of the document index. Searching for the
+  * text marker of one group must yield only that group's value as significant.
+  */
+ public void testNumericSignificance() throws IOException {
+     final long oddValue = 3;
+     final long evenValue = 6;
+     final long commonValue = 2;
+
+     NumberFieldType longFieldType = new NumberFieldType(NumberType.LONG);
+     longFieldType.setName("long_field");
+
+     TextFieldType textFieldType = new TextFieldType();
+     textFieldType.setName("text");
+     textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
+
+     IndexWriterConfig writerConfig = newIndexWriterConfig();
+
+     try (Directory directory = newDirectory(); IndexWriter writer = new IndexWriter(directory, writerConfig)) {
+
+         for (int docIndex = 0; docIndex < 10; docIndex++) {
+             // Documents with an even index form the group tagged "odd".
+             boolean oddGroup = docIndex % 2 == 0;
+             Document doc = new Document();
+             addFields(doc, NumberType.LONG.createFields("long_field", oddGroup ? oddValue : evenValue, true, true, false));
+             doc.add(new Field("text", oddGroup ? "odd" : "even", textFieldType));
+             addFields(doc, NumberType.LONG.createFields("long_field", commonValue, true, true, false));
+             writer.addDocument(doc);
+         }
+
+         SignificantTermsAggregationBuilder numericAgg =
+             new SignificantTermsAggregationBuilder("sig_number", null).field("long_field");
+         numericAgg.executionHint(randomExecutionHint());
+
+         try (IndexReader indexReader = DirectoryReader.open(writer)) {
+             IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+             String oddKey = Long.toString(oddValue);
+             String evenKey = Long.toString(evenValue);
+             String commonKey = Long.toString(commonValue);
+
+             // Foreground: the "odd" group
+             SignificantLongTerms result =
+                 searchAndReduce(indexSearcher, new TermQuery(new Term("text", "odd")), numericAgg, longFieldType);
+             assertEquals(1, result.getBuckets().size());
+
+             assertNull(result.getBucketByKey(evenKey));
+             assertNull(result.getBucketByKey(commonKey));
+             assertNotNull(result.getBucketByKey(oddKey));
+
+             // Foreground: the "even" group
+             result = searchAndReduce(indexSearcher, new TermQuery(new Term("text", "even")), numericAgg, longFieldType);
+             assertEquals(1, result.getBuckets().size());
+
+             assertNull(result.getBucketByKey(oddKey));
+             assertNull(result.getBucketByKey(commonKey));
+             assertNotNull(result.getBucketByKey(evenKey));
+
+         }
+     }
+ }
+
+ /**
+  * Runs the significant terms aggregation against a field name that is not
+  * mapped in the index and expects no buckets to come back.
+  */
+ public void testUnmapped() throws IOException {
+     TextFieldType textFieldType = new TextFieldType();
+     textFieldType.setName("text");
+     textFieldType.setFielddata(true);
+     textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer()));
+
+     IndexWriterConfig writerConfig = newIndexWriterConfig();
+     try (Directory directory = newDirectory(); IndexWriter writer = new IndexWriter(directory, writerConfig)) {
+         addMixedTextDocs(textFieldType, writer);
+
+         // The aggregation targets a field that the indexed documents do not define.
+         SignificantTermsAggregationBuilder unmappedAgg =
+             new SignificantTermsAggregationBuilder("sig_text", null).field("unmapped_field");
+         unmappedAgg.executionHint(randomExecutionHint());
+
+         try (IndexReader indexReader = DirectoryReader.open(writer)) {
+             IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+
+             // Foreground: documents containing "odd"
+             SignificantTerms result =
+                 searchAndReduce(indexSearcher, new TermQuery(new Term("text", "odd")), unmappedAgg, textFieldType);
+             assertEquals(0, result.getBuckets().size());
+
+             for (String key : new String[] {"even", "common", "odd"}) {
+                 assertNull(result.getBucketByKey(key));
+             }
+
+         }
+     }
+ }
+
+ /**
+  * Writes ten documents whose "text" field always starts with "common" followed
+  * by "odd" (even document index) or "even" (odd document index), and stores a
+  * JSON "_source" field holding the same text.
+  */
+ private void addMixedTextDocs(TextFieldType textFieldType, IndexWriter w) throws IOException {
+     for (int docIndex = 0; docIndex < 10; docIndex++) {
+         String body = "common " + (docIndex % 2 == 0 ? "odd " : "even ");
+         String source = "{ \"text\" : \"" + body + "\" }";
+
+         Document doc = new Document();
+         doc.add(new Field("text", body, textFieldType));
+         doc.add(new StoredField("_source", new BytesRef(source)));
+
+         w.addDocument(doc);
+     }
+ }
+
+ /** Adds every field in {@code createFields} to {@code doc}, in list order. */
+ private void addFields(Document doc, List<Field> createFields) {
+     createFields.forEach(doc::add);
+ }
+
+ /**
+  * Picks an execution hint for the aggregation under test: either {@code null}
+  * (no hint) or the string form of a randomly chosen {@link ExecutionMode}.
+  */
+ public String randomExecutionHint() {
+     if (randomBoolean()) {
+         return null;
+     }
+     return randomFrom(ExecutionMode.values()).toString();
+ }
}