diff options
author | Kelley Spoon <kelley.spoon@linaro.org> | 2021-09-30 07:34:16 -0500 |
---|---|---|
committer | Kelley Spoon <kelley.spoon@linaro.org> | 2021-09-30 07:34:16 -0500 |
commit | 546a9b6f3b53925b23ed3ac87d36c04a913669c9 (patch) | |
tree | f0d8b179ab1eff3f640a1275dc52c6884cff6129 | |
parent | 7ab36b94fd1418ded48710c0e917e6f9454becd9 (diff) |
es-scripts: add new scripts to support elasticsearch stability
This change adds in some more advanced scripts meant
to help improve our elasticsearch uptime. All scripts
are python3 and will require virtualenv to run.
Change-Id: I6758b16b5f7fc83f33faa706423785fd07cc59c5
-rwxr-xr-x | check-query.py | 26 | ||||
-rwxr-xr-x | preseed-beats.py | 45 | ||||
-rwxr-xr-x | status-autoclear.py | 42 |
3 files changed, 113 insertions, 0 deletions
#!/usr/bin/env python3

# Cluster health probe: run a trivial match_all query against a known
# index and fail if it does not come back in a timely manner.  Intended
# to be run from cron; a non-zero exit (or an uncaught client-side
# ConnectionTimeout) is the failure signal.

from elasticsearch import Elasticsearch

import sys

ES_HOST = "elasticsearch-production"
ES_PORT = 9200
CHECK_INDEX = "elastalert_status"
BODY = {
    "query": {
        "match_all": {}
    }
}

es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])

# NOTE: ``timeout`` here is the *server-side* search timeout.  When it
# expires, Elasticsearch does not raise -- it returns partial results
# with ``timed_out: true`` in the response body.  The original script
# discarded the response entirely, so a slow-but-reachable cluster
# always passed the check.  Inspect the flag so that a server-side
# timeout is reported as a failure too.
results = es.search(index=CHECK_INDEX, body=BODY, timeout='30s')

if results.get('timed_out', False):
    # Query ran but hit the 30s server-side limit: treat as a failure
    # so cron alerts a human.
    sys.exit(1)

sys.exit(0)
#!/usr/bin/env python3

# Pre-seed the daily Elasticsearch indices used by the *beats shippers.
#
# The cluster has gone red in the past when the midnight-UTC creation of
# a new daily index timed out under write pressure.  Creating tomorrow's
# indices ahead of time (idempotently) takes that load off the cluster:
# beats can start dumping data the moment the date rolls over.

from elasticsearch import Elasticsearch
import requests
import json
from datetime import datetime, timedelta

import sys

ES_HOST = "elasticsearch-production"
ES_PORT = 9200

BEATS_INDICES = [
    'metricbeat-lavalab',
    'metricbeat-systems',
    'filebeat-systems',
    'heartbeat'
]

es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])

# Daily index names carry tomorrow's date as a YYYY.MM.DD suffix.
tomorrow = datetime.utcnow() + timedelta(days=1)
date_suffix = tomorrow.strftime('%Y.%m.%d')

for base in BEATS_INDICES:
    # ignore=400 makes creation idempotent: "index already exists" is
    # fine.  Deliberately no try/except around ConnectionTimeout here --
    # an uncaught exception is how cron generates alert email.
    es.indices.create(index='{0}-{1}'.format(base, date_suffix), ignore=400)

# Still alive: exit 0 so cron stays quiet.
sys.exit(0)
#!/usr/bin/env python3

# If the cluster is red because index creation failed to allocate its
# shards, automatically delete the stuck index so the cluster can
# recover.  Any situation we do not recognize is left for a human:
# either a diagnostic dump plus non-zero exit, or a raised exception
# (which makes cron send email).

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
import requests
import json
from datetime import datetime, timedelta

import sys

ES_HOST = "elasticsearch-production"
ES_PORT = 9200

es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])

health = es.cluster.health()

if health['status'] == 'red':
    try:
        alloc = es.cluster.allocation_explain()
        # The explain API names the index whose shard cannot be placed;
        # that is the one we may need to delete.
        troublemaker = alloc["index"]
        if alloc['unassigned_info']['reason'] == "ALLOCATION_FAILED":
            # Known failure mode: remove the stuck index.
            # NOTE(review): elasticsearch-py forwards ``timeout`` to the
            # server, which expects a duration string such as '60s'; a
            # bare integer 60 may be rejected -- confirm against the
            # client version in use.
            es.indices.delete(index=troublemaker, timeout=60)
        else:
            # Unfamiliar red state: dump the explanation and get a human.
            print(alloc)
            sys.exit(1)
    except RequestError:
        # Something we don't know how to handle -- re-raise so cron
        # emails a human.  A bare ``raise`` preserves the original
        # traceback; the original's ``raise re`` shadowed the ``re``
        # module name and was followed by an unreachable ``sys.exit(1)``,
        # both removed here.
        raise

# Either the cluster was not red, or the delete worked.
sys.exit(0)