summaryrefslogtreecommitdiff
path: root/rest-api-spec/src
diff options
context:
space:
mode:
authormarkharwood <markharwood@gmail.com>2017-05-24 13:46:43 +0100
committerGitHub <noreply@github.com>2017-05-24 13:46:43 +0100
commitb7197f5e2104e3d67fcd2233264ba39dc4058544 (patch)
treed829baccc590269a8e4a499aff907c52db5352c2 /rest-api-spec/src
parentb5adb3cce9917dda22135b14778fb38cfcc0d7cb (diff)
SignificantText aggregation - like significant_terms, but for text (#24432)
* SignificantText aggregation - like significant_terms, but doesn’t require fielddata=true. It is recommended for use with the `sampler` agg to limit the expense of tokenizing docs, and it takes an optional `filter_duplicate_text: true` setting to avoid stats skew from repeated sections of text in search results. Closes #23674
Diffstat (limited to 'rest-api-spec/src')
-rw-r--r--rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml166
1 files changed, 166 insertions, 0 deletions
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml
new file mode 100644
index 0000000000..f31297c960
--- /dev/null
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml
@@ -0,0 +1,166 @@
+---
+"Default index": # Runs significant_text on a `text` field (fielddata disabled) under a terms agg on `class`.
+ - do: # Create the `goodbad` index with an explicit mapping for type `doc`.
+ indices.create:
+ index: goodbad
+ body:
+ settings:
+ number_of_shards: "1" # NOTE(review): presumably one shard keeps term statistics deterministic - confirm
+ mappings:
+ doc:
+ properties:
+ text:
+ type: text
+ fielddata: false # the analysed field is queried with fielddata switched off
+ class:
+ type: keyword
+
+ - do: # Docs 1-7 follow: three are labelled class "good" (ids 1, 2, 5), four class "bad" (ids 3, 4, 6, 7).
+ index:
+ index: goodbad
+ type: doc
+ id: 1
+ body: { text: "good", class: "good" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 2
+ body: { text: "good", class: "good" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 3
+ body: { text: "bad", class: "bad" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 4
+ body: { text: "bad", class: "bad" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 5
+ body: { text: "good bad", class: "good" } # contains both words, labelled "good"
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 6
+ body: { text: "good bad", class: "bad" } # same text as doc 5, labelled "bad"
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 7
+ body: { text: "bad", class: "bad" }
+
+
+
+ - do: # Refresh before searching so the indexed docs are expected to be visible.
+ indices.refresh:
+ index: [goodbad]
+
+ - do: # Body-less search used as a sanity check on the document count.
+ search:
+ index: goodbad
+ type: doc
+
+ - match: {hits.total: 7} # all seven indexed docs must be found
+
+ - do: # Terms agg on `class` with a nested significant_text agg on `text`.
+ search:
+ index: goodbad
+ type: doc
+ body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text"}}}}}}
+
+ - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"} # NOTE(review): bucket 0 is presumably class "bad" (4 docs vs 3) - relies on terms ordering, confirm
+ - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"} # top significant term of the second class bucket must be "good"
+
+---
+"Dedup noise": # Runs significant_text with `filter_duplicate_text: true` on docs that share repeated token sequences.
+ - do: # Create the `goodbad` index with the same mapping as the "Default index" test.
+ indices.create:
+ index: goodbad
+ body:
+ settings:
+ number_of_shards: "1" # NOTE(review): presumably one shard keeps term statistics deterministic - confirm
+ mappings:
+ doc:
+ properties:
+ text:
+ type: text
+ fielddata: false # the analysed field is queried with fielddata switched off
+ class:
+ type: keyword
+
+ - do: # Docs 1-7 follow: each has a unique `noisewordsN` token plus a repeated run (g1..g6, b1..b6 or gb1..gb6).
+ index:
+ index: goodbad
+ type: doc
+ id: 1
+ body: { text: "good noisewords1 g1 g2 g3 g4 g5 g6", class: "good" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 2
+ body: { text: "good noisewords2 g1 g2 g3 g4 g5 g6", class: "good" } # repeats the g1..g6 run from doc 1
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 3
+ body: { text: "bad noisewords3 b1 b2 b3 b4 b5 b6", class: "bad" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 4
+ body: { text: "bad noisewords4 b1 b2 b3 b4 b5 b6", class: "bad" } # repeats the b1..b6 run from doc 3
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 5
+ body: { text: "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", class: "good" }
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 6
+ body: { text: "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", class: "bad" } # repeats the gb1..gb6 run from doc 5, with the other class label
+ - do:
+ index:
+ index: goodbad
+ type: doc
+ id: 7
+ body: { text: "bad noisewords7 b1 b2 b3 b4 b5 b6", class: "bad" }
+
+
+
+ - do: # Refresh before searching so the indexed docs are expected to be visible.
+ indices.refresh:
+ index: [goodbad]
+
+ - do: # Body-less search used as a sanity check on the document count.
+ search:
+ index: goodbad
+ type: doc
+
+ - match: {hits.total: 7} # all seven indexed docs must be found
+
+ - do: # Same aggregation shape as "Default index", with duplicate-text filtering switched on.
+ search:
+ index: goodbad
+ type: doc
+ body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text", "filter_duplicate_text": true}}}}}}
+
+ - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"} # NOTE(review): bucket 0 is presumably class "bad" (4 docs vs 3) - relies on terms ordering, confirm
+ - length: { aggregations.class.buckets.0.sig_text.buckets: 1 } # exactly one significant term; NOTE(review): presumably the repeated b*/gb* runs are dropped as duplicates - confirm
+ - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"} # top significant term of the second class bucket must be "good"
+ - length: { aggregations.class.buckets.1.sig_text.buckets: 1 } # exactly one significant term for the second class bucket as well
+