summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kelley Spoon <kelley.spoon@linaro.org> 2021-09-30 07:34:16 -0500
committer Kelley Spoon <kelley.spoon@linaro.org> 2021-09-30 07:34:16 -0500
commit 546a9b6f3b53925b23ed3ac87d36c04a913669c9 (patch)
tree f0d8b179ab1eff3f640a1275dc52c6884cff6129
parent 7ab36b94fd1418ded48710c0e917e6f9454becd9 (diff)
es-scripts: add new scripts to support elasticsearch stability
This change adds some more advanced scripts meant to help improve our elasticsearch uptime. All scripts are python3 and will require a virtualenv to run.

Change-Id: I6758b16b5f7fc83f33faa706423785fd07cc59c5
-rwxr-xr-xcheck-query.py26
-rwxr-xr-xpreseed-beats.py45
-rwxr-xr-xstatus-autoclear.py42
3 files changed, 113 insertions, 0 deletions
diff --git a/check-query.py b/check-query.py
new file mode 100755
index 0000000..b97c7ca
--- /dev/null
+++ b/check-query.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+
+# This script checks on the status of the cluster
+# by running a query and ensuring it returns
+# a result in a timely manner.
+#
+# Exit status is 0 when the query finished in time and 1 when
+# elasticsearch reports that the search timed out. Client
+# errors such as ConnectionTimeout are left uncaught so the
+# traceback and a non-zero exit reach whoever runs the check.
+
+from elasticsearch import Elasticsearch
+
+import sys
+
+ES_HOST = "elasticsearch-production"
+ES_PORT = 9200
+CHECK_INDEX = "elastalert_status"
+# seconds the check query is allowed to take
+QUERY_TIMEOUT = 30
+BODY = {
+    "query": {
+        "match_all": {}
+    }
+}
+
+es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
+
+# 'timeout' only bounds the search on the server: when it
+# expires the call still returns normally, with partial hits
+# and "timed_out": true. 'request_timeout' is the client-side
+# limit that raises ConnectionTimeout; without it the client
+# default (10 seconds) would cut the check short. It gets a
+# little slack so the server-side limit is the one that fires.
+results = es.search(
+    index=CHECK_INDEX,
+    body=BODY,
+    timeout='{0}s'.format(QUERY_TIMEOUT),
+    request_timeout=QUERY_TIMEOUT + 5,
+)
+
+if results.get('timed_out'):
+    print('query on {0} timed out'.format(CHECK_INDEX), file=sys.stderr)
+    sys.exit(1)
+
+sys.exit(0)
diff --git a/preseed-beats.py b/preseed-beats.py
new file mode 100755
index 0000000..3dadbf7
--- /dev/null
+++ b/preseed-beats.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+# Short script to preseed the elasticsearch indices used by
+# the *beats programs. We have had problems with the ES
+# cluster kicking over into a failed (red) state because
+# of a timeout while trying to create the new daily index
+# for random beats.
+#
+# This script aims to remove some of the pressure on the
+# cluster by having an idempotent way of creating the
+# indices before they are needed so that beats can
+# just start dumping data as soon as the date changes
+
+from elasticsearch import Elasticsearch
+import requests
+import json
+from datetime import datetime, timedelta, timezone
+
+import sys
+
+ES_HOST = "elasticsearch-production"
+ES_PORT = 9200
+
+BEATS_INDICES = [
+    'metricbeat-lavalab',
+    'metricbeat-systems',
+    'filebeat-systems',
+    'heartbeat',
+]
+
+# error types elasticsearch uses for "index already exists"; the
+# name differs between server versions
+ALREADY_EXISTS = (
+    'resource_already_exists_exception',
+    'index_already_exists_exception',
+)
+
+es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
+
+# The index names carry tomorrow's UTC date. Use an aware "now":
+# datetime.utcnow() is deprecated since python 3.12 and the warning
+# it prints on stderr would make cron send mail on every run.
+tomorrow = datetime.now(timezone.utc) + timedelta(days=1)
+timestamp = tomorrow.strftime('%Y.%m.%d')
+
+failed = False
+for prefix in BEATS_INDICES:
+    index = '{0}-{1}'.format(prefix, timestamp)
+    # Do not try to catch the ConnectionTimeout
+    # exception here as we want cron to generate
+    # email if there's a problem.
+    response = es.indices.create(index=index, ignore=400)
+    # ignore=400 hands back the error body instead of raising. Only
+    # an "already exists" answer is expected on a re-run; report any
+    # other bad-request error rather than dropping it.
+    error = response.get('error')
+    if error and error.get('type') not in ALREADY_EXISTS:
+        print('could not create {0}: {1}'.format(index, error),
+              file=sys.stderr)
+        failed = True
+
+# exit 0 to keep cron from chirping unless a create was rejected
+sys.exit(1 if failed else 0)
diff --git a/status-autoclear.py b/status-autoclear.py
new file mode 100755
index 0000000..23b88ff
--- /dev/null
+++ b/status-autoclear.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+# This script checks on the status of the cluster
+# and if it's in a failed state due to a shard
+# collision on allocation for the creation of
+# an index, it will automatically clear the
+# index.
+#
+# Exit status is 0 when the cluster is not red or the stuck
+# index was deleted, and 1 when the cluster is red for a reason
+# this script does not recognise. Errors from the elasticsearch
+# client are left uncaught so the traceback reaches a human.
+
+from elasticsearch import Elasticsearch
+from elasticsearch.exceptions import RequestError
+import requests
+import json
+from datetime import datetime,timedelta
+
+import sys
+
+ES_HOST = "elasticsearch-production"
+ES_PORT = 9200
+# seconds the index delete is allowed to take
+DELETE_TIMEOUT = 60
+
+es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])
+
+health = es.cluster.health()
+
+if health['status'] != 'red':
+    # nothing to clear
+    sys.exit(0)
+
+try:
+    alloc = es.cluster.allocation_explain()
+except RequestError:
+    # something we don't know how to handle... get a human
+    print('cluster is red but allocation explain failed', file=sys.stderr)
+    raise
+
+# ferret out the stuck index
+troublemaker = alloc["index"]
+reason = alloc.get('unassigned_info', {}).get('reason')
+
+if reason != "ALLOCATION_FAILED":
+    # haven't seen it before... dump error and get a human
+    print(alloc)
+    sys.exit(1)
+
+# let's try to remove it. elasticsearch rejects a time value that
+# has no unit, so the server-side timeout is sent as '60s' rather
+# than 60; request_timeout makes the client wait at least as long
+# instead of giving up after its shorter default.
+es.indices.delete(
+    index=troublemaker,
+    timeout='{0}s'.format(DELETE_TIMEOUT),
+    request_timeout=DELETE_TIMEOUT + 5,
+)
+
+# the delete worked
+sys.exit(0)