SignificantText aggregation - like significant_terms, but for text (#24432)

* SignificantText aggregation - like significant_terms, but doesn’t require fielddata=true. Recommended for use with the `sampler` agg to limit the expense of tokenizing docs. Takes an optional `filter_duplicate_text: true` setting to avoid stats skew from repeated sections of text in search results. Closes #23674
author: markharwood <markharwood@gmail.com> 2017-05-24 13:46:43 +0100
committer: GitHub <noreply@github.com> 2017-05-24 13:46:43 +0100
commit: b7197f5e2104e3d67fcd2233264ba39dc4058544 (patch)
tree: d829baccc590269a8e4a499aff907c52db5352c2 /rest-api-spec/src
parent: b5adb3cce9917dda22135b14778fb38cfcc0d7cb (diff)
1 files changed, 166 insertions, 0 deletions
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml
new file mode 100644
index 0000000000..f31297c960
--- /dev/null
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/90_sig_text.yml
@@ -0,0 +1,166 @@
+---
+"Default index":  # significant_text on a text field whose fielddata is disabled
+  - do:
+      indices.create:
+          index:  goodbad
+          body:
+            settings:
+                number_of_shards: "1"  # NOTE(review): presumably a single shard keeps term stats deterministic -- confirm
+            mappings:
+                doc:
+                    properties:
+                        text:
+                            type: text
+                            fielddata: false  # the aggregation under test is meant to work without fielddata
+                        class:
+                            type: keyword
+  # Seven docs follow: class "good" for ids 1, 2, 5 and class "bad" for ids 3, 4, 6, 7.
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     1
+          body:   { text: "good", class: "good" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     2
+          body:   { text: "good", class: "good" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     3
+          body:   { text: "bad", class: "bad" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     4
+          body:   { text: "bad", class: "bad" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     5
+          body:   { text: "good bad", class: "good" }  # text contains both words
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     6
+          body:   { text: "good bad", class: "bad" }  # same text as id 5, opposite class
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     7
+          body:   { text: "bad", class: "bad" }
+
+
+  # Refresh before searching; the next step expects all 7 docs to be counted.
+  - do:
+      indices.refresh:
+        index: [goodbad]
+
+  - do:
+      search:
+        index: goodbad
+        type:  doc
+
+  - match: {hits.total: 7}
+  # Bucket docs by class (terms agg) and ask for the significant text terms inside each bucket.
+  - do:
+      search:
+        index: goodbad
+        type:  doc
+        body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text"}}}}}}
+
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"}  # presumably bucket 0 is class "bad" (4 docs vs 3) -- verify ordering
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"}  # top term of the second class bucket
+  
+---
+"Dedup noise":  # significant_text with filter_duplicate_text enabled on docs that share repeated token runs
+  - do:
+      indices.create:
+          index:  goodbad
+          body:
+            settings:
+                number_of_shards: "1"  # NOTE(review): presumably a single shard keeps term stats deterministic -- confirm
+            mappings:
+                doc:
+                    properties:
+                        text:
+                            type: text
+                            fielddata: false  # the aggregation under test is meant to work without fielddata
+                        class:
+                            type: keyword
+  # Each doc has a unique noisewordsN token plus a repeated run (g1..g6, b1..b6 or gb1..gb6).
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     1
+          body:   { text: "good noisewords1 g1 g2 g3 g4 g5 g6", class: "good" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     2
+          body:   { text: "good  noisewords2 g1 g2 g3 g4 g5 g6", class: "good" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     3
+          body:   { text: "bad noisewords3 b1 b2 b3 b4 b5 b6", class: "bad" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     4
+          body:   { text: "bad noisewords4 b1 b2 b3 b4 b5 b6", class: "bad" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     5
+          body:   { text: "good bad noisewords5 gb1 gb2 gb3 gb4 gb5 gb6", class: "good" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     6
+          body:   { text: "good bad noisewords6 gb1 gb2 gb3 gb4 gb5 gb6", class: "bad" }
+  - do:
+      index:
+          index:  goodbad
+          type:   doc
+          id:     7
+          body:   { text: "bad noisewords7 b1 b2 b3 b4 b5 b6", class: "bad" }
+
+
+  # Refresh before searching; the next step expects all 7 docs to be counted.
+  - do:
+      indices.refresh:
+        index: [goodbad]
+
+  - do:
+      search:
+        index: goodbad
+        type:  doc
+
+  - match: {hits.total: 7}
+  # Same class/sig_text aggregation as the "Default index" test, but with filter_duplicate_text: true.
+  - do:
+      search:
+        index: goodbad
+        type:  doc
+        body: {"aggs": {"class": {"terms": {"field": "class"},"aggs": {"sig_text": {"significant_text": {"field": "text", "filter_duplicate_text": true}}}}}}
+
+  - match: {aggregations.class.buckets.0.sig_text.buckets.0.key: "bad"}  # presumably bucket 0 is class "bad" (4 docs vs 3) -- verify ordering
+  - length: { aggregations.class.buckets.0.sig_text.buckets: 1 }  # only one term expected; presumably the repeated runs are filtered as duplicate text
+  - match: {aggregations.class.buckets.1.sig_text.buckets.0.key: "good"}  # top term of the second class bucket
+  - length: { aggregations.class.buckets.1.sig_text.buckets: 1 }  # only one term expected here as well
+
author	markharwood <markharwood@gmail.com>	2017-05-24 13:46:43 +0100
committer	GitHub <noreply@github.com>	2017-05-24 13:46:43 +0100
commit	b7197f5e2104e3d67fcd2233264ba39dc4058544 (patch)
tree	d829baccc590269a8e4a499aff907c52db5352c2 /rest-api-spec/src
parent	b5adb3cce9917dda22135b14778fb38cfcc0d7cb (diff)