From 77f93c712015d86f2e51870e23db91234e906e7c Mon Sep 17 00:00:00 2001
From: Dobroslaw Zybort
Date: Thu, 16 Jan 2020 15:36:16 +0100
Subject: [PATCH 01/22] Add script for checking running Monasca health

Signed-off-by: Dobroslaw Zybort
---
 cmm-check-health.py | 297 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)
 create mode 100644 cmm-check-health.py

diff --git a/cmm-check-health.py b/cmm-check-health.py
new file mode 100644
index 000000000..2c97f340c
--- /dev/null
+++ b/cmm-check-health.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+
+import json
+import subprocess
+import sys
+
+from time import localtime, gmtime, strftime
+
+# Run this script only with Python 3
+if sys.version_info.major != 3:
+    sys.stdout.write("Sorry, requires Python 3.x\n")
+    sys.exit(1)
+
+print("Running simple tests of running Monasca services")
+print("Local time {}".format(strftime("%Y-%m-%d %H:%M:%S", localtime())))
+print("UTC time {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime())))
+
+docker_exec = ["docker-compose",
+               "-f", "docker-compose-metric.yml",
+               "-f", "docker-compose-log.yml",
+               "exec"]
+
+
+def print_info(service_name, test_function):
+    CGREEN = '\033[92m'
+    CRED = '\033[91m'
+    CEND = '\033[0m'
+    if test_function != 0:
+        print(f"\n{CRED}❌{CEND} There is problem with {service_name}\n")
+    else:
+        print(f"{CGREEN}✔{CEND} {service_name} is fine")
+
+
+###############################################################################
+#
+# Metrics services
+#
+###############################################################################
+
+def test_memcached():
+    try:
+        resp = subprocess.run(docker_exec + ["memcached", "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"],
+                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "get_hits" not in resp:
+        print("There is problem with Memcached")
+        return 2
+
+    return 0
+
+def test_influxdb():
+    try:
+        dbs = subprocess.run(docker_exec + ["influxdb", "influx", "-execute", "SHOW DATABASES"],
+                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "mon" not in dbs:
+        print("Database 'mon' was not found in InfluxDB")
+        return 2
+
+    return 0
+
+
+def test_cadvisor():
+    try:
+        resp = subprocess.run(docker_exec + ["cadvisor", "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"],
+                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "200 OK" not in resp:
+        print("cAdvisor did not returned properly")
+        return 2
+
+    return 0
+
+
+def test_zookeeper():
+    try:
+        resp = subprocess.run(docker_exec + ["zookeeper", "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"],
+                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "zk_avg_latency" not in resp:
+        print("Zookeeper did not returned properly")
+        return 2
+
+    return 0
+
+
+def test_kafka():
+    try:
+        resp = subprocess.run(
+            docker_exec + ["kafka", "ash", "-c",
+                           "kafka-topics.sh --list --zookeeper zookeeper:2181"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "metrics" not in resp:
+        print("'metrics' not found in Kafka topics")
+        return 2
+
+    return 0
+
+
+def test_mysql():
+    mysql_conn = "MYSQL_PWD=secretmysql mysql --silent --skip-column-names "
+
+    try:
+        resp = subprocess.run(
+            docker_exec + ["mysql", "bash", "-c", mysql_conn + "-e 'show databases;'"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "mon" not in resp:
+        print("'mon' database not found in MySQL")
+        return 2
+
+    try:
+        max_conn = subprocess.run(
+            docker_exec + ["mysql", "bash", "-c", mysql_conn + "-e 'select @@max_connections;'"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    try:
+        conn = subprocess.run(
+            docker_exec + ["mysql", "bash", "-c", mysql_conn +
+                           "-e 'SHOW STATUS WHERE `variable_name` = \"Threads_connected\";' | cut -f2"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if int(conn) == int(max_conn):
+        print("MySQL database is using all available connections")
+        return 3
+
+    if int(conn) == 0:
+        print("No one is connecting to MySQL database, is metrics API working properly?")
+        return 4
+
+    return 0
+
+
+def test_monasca():
+    try:
+        resp = subprocess.run(
+            docker_exec + ["monasca", "ash", "-c",
+                           "curl http://localhost:8070/healthcheck"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    jresp = json.loads(resp)
+    if jresp["error"]["title"] != "Unauthorized":
+        print("Monasca API did not returned properly")
+        return 2
+
+    return 0
+
+
+def test_grafana():
+    try:
+        resp = subprocess.run(
+            docker_exec + ["grafana", "ash", "-c",
+                           "wget -qO- http://localhost:3000/api/health"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "database" not in resp:
+        print("Grafana did not returned properly")
+        return 2
+
+    jresp = json.loads(resp)
+    if jresp["database"] != "ok":
+        print(f"Grafana reported problem with database: {jresp['database']}")
+        return 3
+
+    return 0
+
+
+###############################################################################
+#
+# Logs services
+#
+###############################################################################
+
+def test_elasticsearch():
+    try:
+        resp = subprocess.run(
+            docker_exec + ["elasticsearch", "ash", "-c",
+                           "curl -XGET 'localhost:9200/_cluster/health?pretty'"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "monasca" not in resp:
+        print("Elasticsearch did not have 'monasca' cluster")
+        return 2
+
+    jresp = json.loads(resp)
+    if jresp["status"] == "red":
+        print("Elasticsearch health check reports problem with cluster")
+        return 2
+
+    return 0
+
+
+def test_elasticsearch_curator():
+    try:
+        resp = subprocess.run(
+            docker_exec + ["elasticsearch-curator", "ash", "-c",
+                           "curator --dry-run --config /config.yml /action.yml"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+            universal_newlines=True, check=True
+        ).stdout
+    except subprocess.CalledProcessError as exc:
+        print(exc.output)
+        print(exc)
+        return 1
+
+    if "delete_indices" not in resp:
+        print("Elasticsearch Curator did not run properly")
+        return 2
+
+    return 0
+
+
+# Metrics services
+print_info("Memcached", test_memcached())
+print_info("InfluxDB", test_influxdb())
+print_info("cAdvisor", test_cadvisor())
+# print_info("Monasca Agent Forwarder", test_agent_forwarder())
+# print_info("Monasca Agent Collector", test_agent-collector())
+print_info("Zookeeper", test_zookeeper())
+print_info("Kafka", test_kafka())
+print_info("MySQL", test_mysql())
+print_info("Monasca API", test_monasca())
+# print_info("Monasca Persister", test_monasca_persister())
+# print_info("Monasca Thresh", test_thresh())
+# print_info("Monasca Notification", test_monasca_notification())
+print_info("Grafana", test_grafana())
+
+
+# Logs services
+# print_info("Monasca Log Metrics", test_log_metrics())
+# print_info("Monasca Log Persister", test_log_persister())
+# print_info("Monasca Log Transformer", test_log_transformer())
+print_info("Elasticsearch", test_elasticsearch())
+print_info("Elasticsearch Curator", test_elasticsearch_curator())
+# print_info("Kibana", test_kibana())
+# print_info("Monasca Log API", test_log_api())
+# print_info("Monasca Log Agent", test_log_agent())
+# print_info("Monasca Logspout", test_logspout())

From 73c7f9830c7ff7dfca392290afaefeeecf9704e6 Mon Sep 17 00:00:00 2001
From: Dobroslaw Zybort
Date: Tue, 4 Feb 2020 15:39:23 +0100
Subject: [PATCH 02/22] Make Python2 compatible

Signed-off-by: Dobroslaw Zybort
---
 cmm-check-health.py | 138 +++++++++++++++++++++++---------------------
 1 file changed, 72 insertions(+), 66 deletions(-)

diff --git a/cmm-check-health.py b/cmm-check-health.py
index 2c97f340c..0dacd7b4c 100644
--- a/cmm-check-health.py
+++ b/cmm-check-health.py
@@ -1,4 +1,7 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python
+# coding=utf-8
+
+from __future__ import print_function
 
 import json
 import subprocess
@@ -6,10 +9,6 @@
 
 from time import localtime, gmtime, strftime
 
-# Run this script only with Python 3
-if sys.version_info.major != 3:
-    sys.stdout.write("Sorry, requires Python 3.x\n")
-    sys.exit(1)
 
 print("Running simple tests of running Monasca services")
 print("Local time {}".format(strftime("%Y-%m-%d %H:%M:%S", localtime())))
@@ -26,9 +25,9 @@ def print_info(service_name, test_function):
     CRED = '\033[91m'
     CEND = '\033[0m'
     if test_function != 0:
-        print(f"\n{CRED}❌{CEND} There is problem with {service_name}\n")
+        print("\n{}❌{} There is problem with {}\n".format(CRED, CEND, service_name))
     else:
-        print(f"{CGREEN}✔{CEND} {service_name} is fine")
+        print("{}✔{} {} is fine".format(CGREEN, CEND, service_name))
 
 
 ###############################################################################
@@ -39,8 +38,11 @@ def print_info(service_name, test_function):
 
 def test_memcached():
     try:
-        resp = subprocess.run(docker_exec + ["memcached", "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"],
-                              stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout
+        resp = subprocess.check_output(
+            docker_exec + ["memcached",
+                           "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"],
+            stderr=subprocess.STDOUT, universal_newlines=True
+        )
     except subprocess.CalledProcessError as exc:
         print(exc.output)
         print(exc)
@@ -54,8 +56,11 @@ def
test_memcached(): def test_influxdb(): try: - dbs = subprocess.run(docker_exec + ["influxdb", "influx", "-execute", "SHOW DATABASES"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout + dbs = subprocess.check_output( + docker_exec + ["influxdb", + "influx", "-execute", "SHOW DATABASES"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) @@ -70,15 +75,18 @@ def test_influxdb(): def test_cadvisor(): try: - resp = subprocess.run(docker_exec + ["cadvisor", "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout + resp = subprocess.check_output( + docker_exec + ["cadvisor", + "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) return 1 if "200 OK" not in resp: - print("cAdvisor did not returned properly") + print("cAdvisor did not return properly") return 2 return 0 @@ -86,15 +94,18 @@ def test_cadvisor(): def test_zookeeper(): try: - resp = subprocess.run(docker_exec + ["zookeeper", "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True, check=True).stdout + resp = subprocess.check_output( + docker_exec + ["zookeeper", + "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) return 1 if "zk_avg_latency" not in resp: - print("Zookeeper did not returned properly") + print("Zookeeper did not return properly") return 2 return 0 @@ -102,12 +113,11 @@ def test_zookeeper(): def test_kafka(): try: - resp = subprocess.run( - docker_exec + ["kafka", "ash", "-c", - "kafka-topics.sh --list --zookeeper zookeeper:2181"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + resp = subprocess.check_output( + docker_exec + ["kafka", + "ash", "-c", "kafka-topics.sh --list --zookeeper zookeeper:2181"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) @@ -124,11 +134,11 @@ def test_mysql(): mysql_conn = "MYSQL_PWD=secretmysql mysql --silent --skip-column-names " try: - resp = subprocess.run( - docker_exec + ["mysql", "bash", "-c", mysql_conn + "-e 'show databases;'"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + resp = subprocess.check_output( + docker_exec + ["mysql", + "bash", "-c", mysql_conn + "-e 'show databases;'"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) @@ -139,23 +149,23 @@ def test_mysql(): return 2 try: - max_conn = subprocess.run( - docker_exec + ["mysql", "bash", "-c", mysql_conn + "-e 'select @@max_connections;'"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + max_conn = subprocess.check_output( + docker_exec + ["mysql", + "bash", "-c", mysql_conn + "-e 'select @@max_connections;'"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) return 1 try: - conn = subprocess.run( - docker_exec + ["mysql", "bash", "-c", mysql_conn 
+ - "-e 'SHOW STATUS WHERE `variable_name` = \"Threads_connected\";' | cut -f2"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + conn = subprocess.check_output( + docker_exec + ["mysql", + "bash", "-c", mysql_conn + + "-e 'SHOW STATUS WHERE `variable_name` = \"Threads_connected\";' | cut -f2"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) @@ -174,12 +184,11 @@ def test_mysql(): def test_monasca(): try: - resp = subprocess.run( - docker_exec + ["monasca", "ash", "-c", - "curl http://localhost:8070/healthcheck"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + resp = subprocess.check_output( + docker_exec + ["monasca", + "ash", "-c", "curl http://localhost:8070/healthcheck"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) @@ -187,7 +196,7 @@ def test_monasca(): jresp = json.loads(resp) if jresp["error"]["title"] != "Unauthorized": - print("Monasca API did not returned properly") + print("Monasca API did not return properly") return 2 return 0 @@ -195,24 +204,23 @@ def test_monasca(): def test_grafana(): try: - resp = subprocess.run( - docker_exec + ["grafana", "ash", "-c", - "wget -qO- http://localhost:3000/api/health"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + resp = subprocess.check_output( + docker_exec + ["grafana", + "ash", "-c", "wget -qO- http://localhost:3000/api/health"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) return 1 if "database" not in resp: - print("Grafana did not returned properly") + print("Grafana did not return properly") return 2 jresp = json.loads(resp) if jresp["database"] != "ok": - print(f"Grafana reported problem with database: {jresp['database']}") + print("Grafana reported problem with database: {}".format(jresp['database'])) return 3 return 0 @@ -226,12 +234,11 @@ def test_grafana(): def test_elasticsearch(): try: - resp = subprocess.run( - docker_exec + ["elasticsearch", "ash", "-c", - "curl -XGET 'localhost:9200/_cluster/health?pretty'"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + resp = subprocess.check_output( + docker_exec + ["elasticsearch", + "ash", "-c", "curl -XGET 'localhost:9200/_cluster/health?pretty'"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) @@ -251,12 +258,11 @@ def test_elasticsearch(): def test_elasticsearch_curator(): try: - resp = subprocess.run( - docker_exec + ["elasticsearch-curator", "ash", "-c", - "curator --dry-run --config /config.yml /action.yml"], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT, - universal_newlines=True, check=True - ).stdout + resp = subprocess.check_output( + docker_exec + ["elasticsearch-curator", + "ash", "-c", "curator --dry-run --config /config.yml /action.yml"], + stderr=subprocess.STDOUT, universal_newlines=True + ) except subprocess.CalledProcessError as exc: print(exc.output) print(exc) From 7359100d183e6eacd8d5675fc1eb291372e840b5 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Tue, 4 Feb 2020 15:51:41 +0100 Subject: [PATCH 03/22] Don't use MySQL password in plain text Signed-off-by: Dobroslaw Zybort --- cmm-check-health.py | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmm-check-health.py b/cmm-check-health.py index 0dacd7b4c..b4e07cdc9 100644 --- a/cmm-check-health.py +++ b/cmm-check-health.py @@ -131,7 +131,7 @@ def test_kafka(): def test_mysql(): - mysql_conn = "MYSQL_PWD=secretmysql mysql --silent --skip-column-names " + mysql_conn = "MYSQL_PWD=${MYSQL_ROOT_PASSWORD} mysql --silent --skip-column-names " try: resp = subprocess.check_output( From 36b40e31aa4e4a0679b82fdfd5485e6cd0b52da8 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Tue, 11 Feb 2020 17:40:36 +0100 Subject: [PATCH 04/22] Test Kafka lags Some more fixes to script output. Signed-off-by: Dobroslaw Zybort --- cmm-check-health.py | 106 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 16 deletions(-) diff --git a/cmm-check-health.py b/cmm-check-health.py index b4e07cdc9..502d2e0b3 100644 --- a/cmm-check-health.py +++ b/cmm-check-health.py @@ -3,23 +3,40 @@ from __future__ import print_function +import csv import json import subprocess import sys from time import localtime, gmtime, strftime +############################################################################### +# +# Global values +# +############################################################################### -print("Running simple tests of running Monasca services") -print("Local time {}".format(strftime("%Y-%m-%d %H:%M:%S", localtime()))) -print("UTC time {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) +# Report warning when Kafka lag jump over this value +KAFKA_PROBLEM_LAG = 20000 -docker_exec = ["docker-compose", +# String for using docker-compose to exec commands in all services +DOCKER_EXEC = ["docker-compose", "-f", "docker-compose-metric.yml", "-f", "docker-compose-log.yml", "exec"] +print("Running simple tests of running Monasca services") +print("Local time {}".format(strftime("%Y-%m-%d %H:%M:%S", localtime()))) +print("UTC time {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) + +def check_print(func): + def func_wrapper(): + # Print func name with "test_" stripped + print("Checking '{}'".format(func.__name__[5:])) + return func() + return func_wrapper + def print_info(service_name, test_function): CGREEN = '\033[92m' CRED = '\033[91m' @@ -36,10 +53,11 @@ def print_info(service_name, test_function): # ############################################################################### +@check_print def test_memcached(): try: resp = subprocess.check_output( - docker_exec + ["memcached", + DOCKER_EXEC + ["memcached", "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -54,10 +72,12 @@ def test_memcached(): return 0 + +@check_print def test_influxdb(): try: dbs = subprocess.check_output( - docker_exec + ["influxdb", + DOCKER_EXEC + ["influxdb", "influx", "-execute", "SHOW DATABASES"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -73,10 +93,11 @@ def test_influxdb(): return 0 +@check_print def test_cadvisor(): try: resp = subprocess.check_output( - docker_exec + ["cadvisor", + DOCKER_EXEC + ["cadvisor", "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -92,10 +113,11 @@ def test_cadvisor(): return 0 +@check_print def test_zookeeper(): try: resp = subprocess.check_output( - docker_exec + ["zookeeper", + DOCKER_EXEC + ["zookeeper", "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -111,10 +133,11 @@ def test_zookeeper(): return 0 
+@check_print def test_kafka(): try: resp = subprocess.check_output( - docker_exec + ["kafka", + DOCKER_EXEC + ["kafka", "ash", "-c", "kafka-topics.sh --list --zookeeper zookeeper:2181"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -127,15 +150,62 @@ def test_kafka(): print("'metrics' not found in Kafka topics") return 2 + cons_cmd = "kafka-consumer-offset-checker.sh --zookeeper zookeeper:2181 --group {} --topic {}" + + groups_topics = [ + ("thresh-event", "events"), + ("log-transformer", "log"), + ("log-persister", "log-transformed"), + ("log-metric", "log-transformed"), + ("1_metrics", "metrics"), + ("thresh-metric", "metrics") + ] + bad_lag = False + for row in groups_topics: + check_cmd = cons_cmd.format(row[0], row[1]) + try: + resp = subprocess.check_output( + DOCKER_EXEC + ["kafka", + "ash", "-c", check_cmd], + stderr=subprocess.STDOUT, universal_newlines=True + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + # Parse output from listing partitions + reader = csv.reader(resp.split('\n'), delimiter=' ', skipinitialspace=True) + # Remove depreciation waring and row with column titles + partition_list = list(reader)[2:] + + lags = [] + for partition in partition_list: + if len(partition) > 1: + # Take values only form `Lag` column + lags.append(int(partition[5])) + biggest_lag = sorted(lags, reverse=True)[0] + if biggest_lag > KAFKA_PROBLEM_LAG: + print("Lag for group `{}`, topic `{}` grow over {}. Biggest found lag {}".format( + row[0], row[1], KAFKA_PROBLEM_LAG, biggest_lag)) + print("You can print all lags with: `{} kafka ash -c '{}'`".format( + " ".join(DOCKER_EXEC), check_cmd)) + bad_lag = True + + if bad_lag: + # If too big lag was found return with error + return 3 + return 0 +@check_print def test_mysql(): mysql_conn = "MYSQL_PWD=${MYSQL_ROOT_PASSWORD} mysql --silent --skip-column-names " try: resp = subprocess.check_output( - docker_exec + ["mysql", + DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'show databases;'"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -150,7 +220,7 @@ def test_mysql(): try: max_conn = subprocess.check_output( - docker_exec + ["mysql", + DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'select @@max_connections;'"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -161,7 +231,7 @@ def test_mysql(): try: conn = subprocess.check_output( - docker_exec + ["mysql", + DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'SHOW STATUS WHERE `variable_name` = \"Threads_connected\";' | cut -f2"], stderr=subprocess.STDOUT, universal_newlines=True @@ -182,10 +252,11 @@ def test_mysql(): return 0 +@check_print def test_monasca(): try: resp = subprocess.check_output( - docker_exec + ["monasca", + DOCKER_EXEC + ["monasca", "ash", "-c", "curl http://localhost:8070/healthcheck"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -202,10 +273,11 @@ def test_monasca(): return 0 +@check_print def test_grafana(): try: resp = subprocess.check_output( - docker_exec + ["grafana", + DOCKER_EXEC + ["grafana", "ash", "-c", "wget -qO- http://localhost:3000/api/health"], stderr=subprocess.STDOUT, universal_newlines=True ) @@ -232,10 +304,11 @@ def test_grafana(): # ############################################################################### +@check_print def test_elasticsearch(): try: resp = subprocess.check_output( - docker_exec + ["elasticsearch", + DOCKER_EXEC + ["elasticsearch", "ash", "-c", "curl -XGET 'localhost:9200/_cluster/health?pretty'"], stderr=subprocess.STDOUT, 
universal_newlines=True ) @@ -256,10 +329,11 @@ def test_elasticsearch(): return 0 +@check_print def test_elasticsearch_curator(): try: resp = subprocess.check_output( - docker_exec + ["elasticsearch-curator", + DOCKER_EXEC + ["elasticsearch-curator", "ash", "-c", "curator --dry-run --config /config.yml /action.yml"], stderr=subprocess.STDOUT, universal_newlines=True ) From f2ed262e711899ca58db6905d13325fe232df375 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Mon, 24 Feb 2020 09:32:16 +0100 Subject: [PATCH 05/22] Check docker events for restarts and oom Signed-off-by: Dobroslaw Zybort --- cmm-check-health.py | 161 ++++++++++++++++++++++++++++---------------- 1 file changed, 104 insertions(+), 57 deletions(-) diff --git a/cmm-check-health.py b/cmm-check-health.py index 502d2e0b3..fa657b57e 100644 --- a/cmm-check-health.py +++ b/cmm-check-health.py @@ -1,13 +1,12 @@ #!/usr/bin/env python # coding=utf-8 -from __future__ import print_function - import csv import json import subprocess import sys +from shlex import shlex from time import localtime, gmtime, strftime ############################################################################### @@ -19,6 +18,9 @@ # Report warning when Kafka lag jump over this value KAFKA_PROBLEM_LAG = 20000 +# After this number of restarts of one service issue warning to operator +MAX_RESTARTS = 10 + # String for using docker-compose to exec commands in all services DOCKER_EXEC = ["docker-compose", "-f", "docker-compose-metric.yml", @@ -30,21 +32,18 @@ print("Local time {}".format(strftime("%Y-%m-%d %H:%M:%S", localtime()))) print("UTC time {}".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()))) -def check_print(func): - def func_wrapper(): - # Print func name with "test_" stripped - print("Checking '{}'".format(func.__name__[5:])) - return func() - return func_wrapper def print_info(service_name, test_function): CGREEN = '\033[92m' CRED = '\033[91m' CEND = '\033[0m' - if test_function != 0: + + print("Checking '{}'".format(service_name)) + + if test_function() != 0: print("\n{}❌{} There is problem with {}\n".format(CRED, CEND, service_name)) else: - print("{}✔{} {} is fine".format(CGREEN, CEND, service_name)) + print("{}✔{} {} looks fine".format(CGREEN, CEND, service_name)) ############################################################################### @@ -53,7 +52,6 @@ def print_info(service_name, test_function): # ############################################################################### -@check_print def test_memcached(): try: resp = subprocess.check_output( @@ -68,12 +66,11 @@ def test_memcached(): if "get_hits" not in resp: print("There is problem with Memcached") - return 2 + return 1 return 0 -@check_print def test_influxdb(): try: dbs = subprocess.check_output( @@ -88,12 +85,11 @@ def test_influxdb(): if "mon" not in dbs: print("Database 'mon' was not found in InfluxDB") - return 2 + return 1 return 0 -@check_print def test_cadvisor(): try: resp = subprocess.check_output( @@ -108,12 +104,11 @@ def test_cadvisor(): if "200 OK" not in resp: print("cAdvisor did not return properly") - return 2 + return 1 return 0 -@check_print def test_zookeeper(): try: resp = subprocess.check_output( @@ -128,12 +123,11 @@ def test_zookeeper(): if "zk_avg_latency" not in resp: print("Zookeeper did not return properly") - return 2 + return 1 return 0 -@check_print def test_kafka(): try: resp = subprocess.check_output( @@ -148,7 +142,7 @@ def test_kafka(): if "metrics" not in resp: print("'metrics' not found in Kafka topics") - return 2 + return 1 cons_cmd = 
"kafka-consumer-offset-checker.sh --zookeeper zookeeper:2181 --group {} --topic {}" @@ -194,12 +188,11 @@ def test_kafka(): if bad_lag: # If too big lag was found return with error - return 3 + return 1 return 0 -@check_print def test_mysql(): mysql_conn = "MYSQL_PWD=${MYSQL_ROOT_PASSWORD} mysql --silent --skip-column-names " @@ -216,7 +209,7 @@ def test_mysql(): if "mon" not in resp: print("'mon' database not found in MySQL") - return 2 + return 1 try: max_conn = subprocess.check_output( @@ -243,16 +236,15 @@ def test_mysql(): if int(conn) == int(max_conn): print("MySQL database is using all available connections") - return 3 + return 1 if int(conn) == 0: print("No one is connecting to MySQL database, is metrics API working properly?") - return 4 + return 1 return 0 -@check_print def test_monasca(): try: resp = subprocess.check_output( @@ -268,12 +260,11 @@ def test_monasca(): jresp = json.loads(resp) if jresp["error"]["title"] != "Unauthorized": print("Monasca API did not return properly") - return 2 + return 1 return 0 -@check_print def test_grafana(): try: resp = subprocess.check_output( @@ -288,12 +279,12 @@ def test_grafana(): if "database" not in resp: print("Grafana did not return properly") - return 2 + return 1 jresp = json.loads(resp) if jresp["database"] != "ok": print("Grafana reported problem with database: {}".format(jresp['database'])) - return 3 + return 1 return 0 @@ -304,7 +295,6 @@ def test_grafana(): # ############################################################################### -@check_print def test_elasticsearch(): try: resp = subprocess.check_output( @@ -319,17 +309,16 @@ def test_elasticsearch(): if "monasca" not in resp: print("Elasticsearch did not have 'monasca' cluster") - return 2 + return 1 jresp = json.loads(resp) if jresp["status"] == "red": print("Elasticsearch health check reports problem with cluster") - return 2 + return 1 return 0 -@check_print def test_elasticsearch_curator(): try: resp = subprocess.check_output( @@ -344,34 +333,92 @@ def test_elasticsearch_curator(): if "delete_indices" not in resp: print("Elasticsearch Curator did not run properly") - return 2 + return 1 return 0 +############################################################################### +# +# Global Docker checks +# +############################################################################### + +def test_docker_events(): + try: + resp = subprocess.check_output( + ["docker", "system", "events", + "--filter", "event=die", "--filter", "event=oom", + "--since=24h", "--until=1s"], + stderr=subprocess.STDOUT, universal_newlines=True + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + filtered_list = {} + + return_error = 0 + for row in resp.splitlines(): + + tags = row[row.find('(')+1:-1] + lexer = shlex(tags, posix=True) + # Separate words + lexer.whitespace = ", " + # Split only on whitespace chars + lexer.whitespace_split = True + # "=" is part of the word + lexer.wordchars += "=" + # Separate key=value pairs to dict, split each pair only on first "=" + parsed_row = dict(word.split("=", 1) for word in lexer) + service = parsed_row["com.docker.compose.service"] + + # Check for out of memory errors + if "container oom" in row: + print(" Service '{}' got killed in the last 24 hours because " + "of out of memory error, please check" + .format(service)) + return_error = 1 + + if service not in filtered_list: + filtered_list[service] = {"restarts": 0} + filtered_list[service]["restarts"] += 1 + + for key in filtered_list: + if 
filtered_list[key]["restarts"] > MAX_RESTARTS: + print(" Service '{}' restarted at least {} times in last " + "24 hours, please check" + .format(key, filtered_list[key]["restarts"])) + return_error = 1 + + return return_error + +print_info("Docker events", test_docker_events) + # Metrics services -print_info("Memcached", test_memcached()) -print_info("InfluxDB", test_influxdb()) -print_info("cAdvisor", test_cadvisor()) -# print_info("Monasca Agent Forwarder", test_agent_forwarder()) -# print_info("Monasca Agent Collector", test_agent-collector()) -print_info("Zookeeper", test_zookeeper()) -print_info("Kafka", test_kafka()) -print_info("MySQL", test_mysql()) -print_info("Monasca API", test_monasca()) -# print_info("Monasca Persister", test_monasca_persister()) -# print_info("Monasca Thresh", test_thresh()) -# print_info("Monasca Notification", test_monasca_notification()) -print_info("Grafana", test_grafana()) +print_info("Memcached", test_memcached) +print_info("InfluxDB", test_influxdb) +print_info("cAdvisor", test_cadvisor) +# print_info("Monasca Agent Forwarder", test_agent_forwarder +# print_info("Monasca Agent Collector", test_agent-collector +print_info("Zookeeper", test_zookeeper) +print_info("Kafka", test_kafka) +print_info("MySQL", test_mysql) +print_info("Monasca API", test_monasca) +# print_info("Monasca Persister", test_monasca_persister) +# print_info("Monasca Thresh", test_thresh) +# print_info("Monasca Notification", test_monasca_notification) +print_info("Grafana", test_grafana) # Logs services -# print_info("Monasca Log Metrics", test_log_metrics()) -# print_info("Monasca Log Persister", test_log_persister()) -# print_info("Monasca Log Transformer", test_log_transformer()) -print_info("Elasticsearch", test_elasticsearch()) -print_info("Elasticsearch Curator", test_elasticsearch_curator()) -# print_info("Kibana", test_kibana()) -# print_info("Monasca Log API", test_log_api()) -# print_info("Monasca Log Agent", test_log_agent()) -# print_info("Monasca Logspout", test_logspout()) +# print_info("Monasca Log Metrics", test_log_metrics) +# print_info("Monasca Log Persister", test_log_persister) +# print_info("Monasca Log Transformer", test_log_transformer) +print_info("Elasticsearch", test_elasticsearch) +print_info("Elasticsearch Curator", test_elasticsearch_curator) +# print_info("Kibana", test_kibana) +# print_info("Monasca Log API", test_log_api) +# print_info("Monasca Log Agent", test_log_agent) +# print_info("Monasca Logspout", test_logspout) From 7b782f60d9e7f35f7433cb29d1afce698849d7e3 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Tue, 25 Feb 2020 17:35:41 +0100 Subject: [PATCH 06/22] Move script to tools directory Add Kibana check. 
--- .../check-health/cmm-check-health.py | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) rename cmm-check-health.py => tools/check-health/cmm-check-health.py (89%) diff --git a/cmm-check-health.py b/tools/check-health/cmm-check-health.py similarity index 89% rename from cmm-check-health.py rename to tools/check-health/cmm-check-health.py index fa657b57e..e97d45f39 100644 --- a/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -3,6 +3,7 @@ import csv import json +import os import subprocess import sys @@ -21,10 +22,18 @@ # After this number of restarts of one service issue warning to operator MAX_RESTARTS = 10 +# Script directory +script_dir = os.path.dirname(os.path.abspath(__file__)) +# Get out of tools dir to root dir with docker-compose yaml files +root_dir = os.path.normpath(os.path.join(script_dir, os.path.pardir, os.path.pardir)) +compose_metrics_path = os.path.join(root_dir, "docker-compose-metric.yml") +compose_logs_path = os.path.join(root_dir, "docker-compose-log.yml") + # String for using docker-compose to exec commands in all services DOCKER_EXEC = ["docker-compose", - "-f", "docker-compose-metric.yml", - "-f", "docker-compose-log.yml", + "--project-directory", root_dir, + "--file", compose_metrics_path, + "--file", compose_logs_path, "exec"] @@ -176,7 +185,7 @@ def test_kafka(): lags = [] for partition in partition_list: if len(partition) > 1: - # Take values only form `Lag` column + # Take values only from `Lag` column lags.append(int(partition[5])) biggest_lag = sorted(lags, reverse=True)[0] if biggest_lag > KAFKA_PROBLEM_LAG: @@ -337,6 +346,25 @@ def test_elasticsearch_curator(): return 0 +def test_kibana(): + try: + resp = subprocess.check_output( + DOCKER_EXEC + ["kibana", + "sh", "-c", "wget -qO- http://localhost:5601/api/status"], + stderr=subprocess.STDOUT, universal_newlines=True + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + jresp = json.loads(resp) + if jresp["status"]["overall"]["state"] != "green": + print("Kibana health check reports problem") + return 1 + + return 0 + ############################################################################### # @@ -394,14 +422,22 @@ def test_docker_events(): return return_error + +############################################################################### +# +# Run checks +# +############################################################################### + + print_info("Docker events", test_docker_events) # Metrics services print_info("Memcached", test_memcached) print_info("InfluxDB", test_influxdb) print_info("cAdvisor", test_cadvisor) -# print_info("Monasca Agent Forwarder", test_agent_forwarder -# print_info("Monasca Agent Collector", test_agent-collector +# print_info("Monasca Agent Forwarder", test_agent_forwarder) +# print_info("Monasca Agent Collector", test_agent-collector) print_info("Zookeeper", test_zookeeper) print_info("Kafka", test_kafka) print_info("MySQL", test_mysql) @@ -418,7 +454,7 @@ def test_docker_events(): # print_info("Monasca Log Transformer", test_log_transformer) print_info("Elasticsearch", test_elasticsearch) print_info("Elasticsearch Curator", test_elasticsearch_curator) -# print_info("Kibana", test_kibana) +print_info("Kibana", test_kibana) # print_info("Monasca Log API", test_log_api) # print_info("Monasca Log Agent", test_log_agent) # print_info("Monasca Logspout", test_logspout) From 7c7e08c8553df6315f551b51172de929e64499d5 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Tue, 25 Feb 2020 
18:34:07 +0100 Subject: [PATCH 07/22] Add README file Signed-off-by: Dobroslaw Zybort --- tools/check-health/README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tools/check-health/README.md diff --git a/tools/check-health/README.md b/tools/check-health/README.md new file mode 100644 index 000000000..1a525dae8 --- /dev/null +++ b/tools/check-health/README.md @@ -0,0 +1,36 @@ +# CMM Health check script + +Main aim of this script is fast and on the spot checking of health of running +Cloud Monitoring Manager components. It's checking for most common problems +so it's not 100% foolproof but could provide operator with potential ideas +where some problems could started or where problems could arise in the close +future. + +## Running script + +Script is compatible with both Python 2 and Python 3. + +```bash +python3 cmm-check-health.py +``` + +or + +```bash +python2 cmm-check-health.py +``` + +## Checks provided by the script + +* Checking Docker events for number of restarts of every service in the last + 24 hours (report warning when more than 10 restarts happen). +* Checking for number of restarts because "out of memory" errors (report on + every such event). +* All services with the ability to check they status with some kind of request + to them this request is done from inside they containers. +* Checking output from previous requests for containing specific text (like + if proper database exists or status is "green"). +* For MySQL: + * Is anyone connected to the database? + * Is MySQL database using all available connections? +* Check lags in Kafka topics. From 39f9b9395f3f02a01c8f58b1cc7b332c1db2ca6b Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Wed, 26 Feb 2020 11:14:59 +0100 Subject: [PATCH 08/22] Check all Kafka topics for existence Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index e97d45f39..2a92bad05 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -149,9 +149,20 @@ def test_kafka(): print(exc) return 1 - if "metrics" not in resp: - print("'metrics' not found in Kafka topics") - return 1 + kafka_topics = [ + "60-seconds-notifications", + "alarm-notifications", + "alarm-state-transitions", + "events", + "log", + "log-transformed", + "metrics", + "retry-notifications" + ] + for topic in kafka_topics: + if topic not in resp: + print("'{}' not found in Kafka topics".format(topic)) + return 1 cons_cmd = "kafka-consumer-offset-checker.sh --zookeeper zookeeper:2181 --group {} --topic {}" From f7c6fb9af87b89b9bef8ea330ad632790cb2c1e1 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Wed, 26 Feb 2020 11:31:45 +0100 Subject: [PATCH 09/22] Remove return 0 from all functions Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 2a92bad05..64596e55c 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -49,7 +49,7 @@ def print_info(service_name, test_function): print("Checking '{}'".format(service_name)) - if test_function() != 0: + if test_function() is not None: print("\n{}❌{} There is problem with {}\n".format(CRED, CEND, service_name)) else: 
print("{}✔{} {} looks fine".format(CGREEN, CEND, service_name)) @@ -77,8 +77,6 @@ def test_memcached(): print("There is problem with Memcached") return 1 - return 0 - def test_influxdb(): try: @@ -96,8 +94,6 @@ def test_influxdb(): print("Database 'mon' was not found in InfluxDB") return 1 - return 0 - def test_cadvisor(): try: @@ -115,8 +111,6 @@ def test_cadvisor(): print("cAdvisor did not return properly") return 1 - return 0 - def test_zookeeper(): try: @@ -134,8 +128,6 @@ def test_zookeeper(): print("Zookeeper did not return properly") return 1 - return 0 - def test_kafka(): try: @@ -210,8 +202,6 @@ def test_kafka(): # If too big lag was found return with error return 1 - return 0 - def test_mysql(): mysql_conn = "MYSQL_PWD=${MYSQL_ROOT_PASSWORD} mysql --silent --skip-column-names " @@ -262,8 +252,6 @@ def test_mysql(): print("No one is connecting to MySQL database, is metrics API working properly?") return 1 - return 0 - def test_monasca(): try: @@ -282,8 +270,6 @@ def test_monasca(): print("Monasca API did not return properly") return 1 - return 0 - def test_grafana(): try: @@ -306,8 +292,6 @@ def test_grafana(): print("Grafana reported problem with database: {}".format(jresp['database'])) return 1 - return 0 - ############################################################################### # @@ -336,8 +320,6 @@ def test_elasticsearch(): print("Elasticsearch health check reports problem with cluster") return 1 - return 0 - def test_elasticsearch_curator(): try: @@ -355,7 +337,6 @@ def test_elasticsearch_curator(): print("Elasticsearch Curator did not run properly") return 1 - return 0 def test_kibana(): try: @@ -374,8 +355,6 @@ def test_kibana(): print("Kibana health check reports problem") return 1 - return 0 - ############################################################################### # @@ -398,7 +377,7 @@ def test_docker_events(): filtered_list = {} - return_error = 0 + return_error = None for row in resp.splitlines(): tags = row[row.find('(')+1:-1] From b5ca557902f24fa3cd4009489930794902bcd560 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Tue, 3 Mar 2020 12:44:10 +0100 Subject: [PATCH 10/22] Add command line args support Signed-off-by: Dobroslaw Zybort --- tools/check-health/README.md | 14 ++ tools/check-health/cmm-check-health.py | 247 ++++++++++++++----------- 2 files changed, 157 insertions(+), 104 deletions(-) diff --git a/tools/check-health/README.md b/tools/check-health/README.md index 1a525dae8..35e8ac808 100644 --- a/tools/check-health/README.md +++ b/tools/check-health/README.md @@ -20,6 +20,20 @@ or python2 cmm-check-health.py ``` +### Command line arguments + +You can use the following arguments to script: + +| Short | Long | Default | Description | +|-------|----------------|---------|------------------------------------------------------------| +| -m | --metrics | False | Check metrics pipeline | +| -l | --logs | False | Check logs pipeline | +| -k | --kafka-lag | 20000 | Report warning when Kafka lag jump over this value | +| -r | --max-restarts | 10 | After this number of restarts of one service issue warning | + +If you start script without `--metrics` and `--logs` arguments both pipelines +will be checked. 
+ ## Checks provided by the script * Checking Docker events for number of restarts of every service in the last diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 64596e55c..ff8e46156 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -5,8 +5,8 @@ import json import os import subprocess -import sys +from argparse import ArgumentParser from shlex import shlex from time import localtime, gmtime, strftime @@ -16,12 +16,6 @@ # ############################################################################### -# Report warning when Kafka lag jump over this value -KAFKA_PROBLEM_LAG = 20000 - -# After this number of restarts of one service issue warning to operator -MAX_RESTARTS = 10 - # Script directory script_dir = os.path.dirname(os.path.abspath(__file__)) # Get out of tools dir to root dir with docker-compose yaml files @@ -36,6 +30,29 @@ "--file", compose_logs_path, "exec"] +prog_desc = "Cloud Monitoring Manager health check script." +parser = ArgumentParser(description=prog_desc) + +parser.add_argument( + "-m", "--metrics", action="store_true", + help="Check metrics pipeline") +parser.add_argument( + "-l", "--logs", action="store_true", + help="Check logs pipeline") + +parser.add_argument( + "-k", "--kafka-lag", default=20000, type=int, + help="Report warning when Kafka lag jump over this value") +parser.add_argument( + "-r", "--max-restarts", default=10, type=int, + help="After this number of restarts of one service issue warning") + +ARGS = parser.parse_args() + +# No arguments provided, check both pipelines +if not ARGS.metrics and not ARGS.logs: + ARGS.metrics = True + ARGS.logs = True print("Running simple tests of running Monasca services") print("Local time {}".format(strftime("%Y-%m-%d %H:%M:%S", localtime()))) @@ -129,80 +146,6 @@ def test_zookeeper(): return 1 -def test_kafka(): - try: - resp = subprocess.check_output( - DOCKER_EXEC + ["kafka", - "ash", "-c", "kafka-topics.sh --list --zookeeper zookeeper:2181"], - stderr=subprocess.STDOUT, universal_newlines=True - ) - except subprocess.CalledProcessError as exc: - print(exc.output) - print(exc) - return 1 - - kafka_topics = [ - "60-seconds-notifications", - "alarm-notifications", - "alarm-state-transitions", - "events", - "log", - "log-transformed", - "metrics", - "retry-notifications" - ] - for topic in kafka_topics: - if topic not in resp: - print("'{}' not found in Kafka topics".format(topic)) - return 1 - - cons_cmd = "kafka-consumer-offset-checker.sh --zookeeper zookeeper:2181 --group {} --topic {}" - - groups_topics = [ - ("thresh-event", "events"), - ("log-transformer", "log"), - ("log-persister", "log-transformed"), - ("log-metric", "log-transformed"), - ("1_metrics", "metrics"), - ("thresh-metric", "metrics") - ] - bad_lag = False - for row in groups_topics: - check_cmd = cons_cmd.format(row[0], row[1]) - try: - resp = subprocess.check_output( - DOCKER_EXEC + ["kafka", - "ash", "-c", check_cmd], - stderr=subprocess.STDOUT, universal_newlines=True - ) - except subprocess.CalledProcessError as exc: - print(exc.output) - print(exc) - return 1 - - # Parse output from listing partitions - reader = csv.reader(resp.split('\n'), delimiter=' ', skipinitialspace=True) - # Remove depreciation waring and row with column titles - partition_list = list(reader)[2:] - - lags = [] - for partition in partition_list: - if len(partition) > 1: - # Take values only from `Lag` column - lags.append(int(partition[5])) - biggest_lag = sorted(lags, 
reverse=True)[0] - if biggest_lag > KAFKA_PROBLEM_LAG: - print("Lag for group `{}`, topic `{}` grow over {}. Biggest found lag {}".format( - row[0], row[1], KAFKA_PROBLEM_LAG, biggest_lag)) - print("You can print all lags with: `{} kafka ash -c '{}'`".format( - " ".join(DOCKER_EXEC), check_cmd)) - bad_lag = True - - if bad_lag: - # If too big lag was found return with error - return 1 - - def test_mysql(): mysql_conn = "MYSQL_PWD=${MYSQL_ROOT_PASSWORD} mysql --silent --skip-column-names " @@ -356,6 +299,97 @@ def test_kibana(): return 1 +############################################################################### +# +# Cross pipeline services +# +############################################################################### + +def test_kafka(): + try: + resp = subprocess.check_output( + DOCKER_EXEC + ["kafka", + "ash", "-c", "kafka-topics.sh --list --zookeeper zookeeper:2181"], + stderr=subprocess.STDOUT, universal_newlines=True + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + kafka_topics = [] + if ARGS.metrics: + kafka_topics.extend([ + "60-seconds-notifications", + "alarm-notifications", + "alarm-state-transitions", + "events", + "metrics", + "retry-notifications" + ]) + if ARGS.logs: + kafka_topics.extend([ + "log", + "log-transformed" + ]) + + for topic in kafka_topics: + if topic not in resp: + print("'{}' not found in Kafka topics".format(topic)) + return 1 + + cons_cmd = "kafka-consumer-offset-checker.sh --zookeeper zookeeper:2181 --group {} --topic {}" + + groups_topics = [] + if ARGS.metrics: + groups_topics.extend([ + ("thresh-event", "events"), + ("1_metrics", "metrics"), + ("thresh-metric", "metrics") + ]) + if ARGS.logs: + groups_topics.extend([ + ("log-transformer", "log"), + ("log-persister", "log-transformed"), + ("log-metric", "log-transformed") + ]) + bad_lag = False + for row in groups_topics: + check_cmd = cons_cmd.format(row[0], row[1]) + try: + resp = subprocess.check_output( + DOCKER_EXEC + ["kafka", + "ash", "-c", check_cmd], + stderr=subprocess.STDOUT, universal_newlines=True + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + # Parse output from listing partitions + reader = csv.reader(resp.split('\n'), delimiter=' ', skipinitialspace=True) + # Remove depreciation waring and row with column titles + partition_list = list(reader)[2:] + + lags = [] + for partition in partition_list: + if len(partition) > 1: + # Take values only from `Lag` column + lags.append(int(partition[5])) + biggest_lag = sorted(lags, reverse=True)[0] + if biggest_lag > ARGS.kafka_lag: + print("Lag for group `{}`, topic `{}` grow over {}. 
Biggest found lag {}".format( + row[0], row[1], ARGS.kafka_lag, biggest_lag)) + print("You can print all lags with: `{} kafka ash -c '{}'`".format( + " ".join(DOCKER_EXEC), check_cmd)) + bad_lag = True + + if bad_lag: + # If too big lag was found return with error + return 1 + + ############################################################################### # # Global Docker checks @@ -404,7 +438,7 @@ def test_docker_events(): filtered_list[service]["restarts"] += 1 for key in filtered_list: - if filtered_list[key]["restarts"] > MAX_RESTARTS: + if filtered_list[key]["restarts"] > ARGS.max_restarts: print(" Service '{}' restarted at least {} times in last " "24 hours, please check" .format(key, filtered_list[key]["restarts"])) @@ -423,28 +457,33 @@ def test_docker_events(): print_info("Docker events", test_docker_events) # Metrics services -print_info("Memcached", test_memcached) -print_info("InfluxDB", test_influxdb) -print_info("cAdvisor", test_cadvisor) -# print_info("Monasca Agent Forwarder", test_agent_forwarder) -# print_info("Monasca Agent Collector", test_agent-collector) -print_info("Zookeeper", test_zookeeper) -print_info("Kafka", test_kafka) -print_info("MySQL", test_mysql) -print_info("Monasca API", test_monasca) -# print_info("Monasca Persister", test_monasca_persister) -# print_info("Monasca Thresh", test_thresh) -# print_info("Monasca Notification", test_monasca_notification) -print_info("Grafana", test_grafana) +if ARGS.metrics: + print_info("Memcached", test_memcached) + print_info("InfluxDB", test_influxdb) + print_info("cAdvisor", test_cadvisor) + # print_info("Monasca Agent Forwarder", test_agent_forwarder) + # print_info("Monasca Agent Collector", test_agent-collector) + print_info("Zookeeper", test_zookeeper) + print_info("MySQL", test_mysql) + print_info("Monasca API", test_monasca) + # print_info("Monasca Persister", test_monasca_persister) + # print_info("Monasca Thresh", test_thresh) + # print_info("Monasca Notification", test_monasca_notification) + print_info("Grafana", test_grafana) # Logs services -# print_info("Monasca Log Metrics", test_log_metrics) -# print_info("Monasca Log Persister", test_log_persister) -# print_info("Monasca Log Transformer", test_log_transformer) -print_info("Elasticsearch", test_elasticsearch) -print_info("Elasticsearch Curator", test_elasticsearch_curator) -print_info("Kibana", test_kibana) -# print_info("Monasca Log API", test_log_api) -# print_info("Monasca Log Agent", test_log_agent) -# print_info("Monasca Logspout", test_logspout) +if ARGS.logs: + # print_info("Monasca Log Metrics", test_log_metrics) + # print_info("Monasca Log Persister", test_log_persister) + # print_info("Monasca Log Transformer", test_log_transformer) + print_info("Elasticsearch", test_elasticsearch) + print_info("Elasticsearch Curator", test_elasticsearch_curator) + print_info("Kibana", test_kibana) + # print_info("Monasca Log API", test_log_api) + # print_info("Monasca Log Agent", test_log_agent) + # print_info("Monasca Logspout", test_logspout) + +# Cross pipeline services +if ARGS.metrics or ARGS.logs: + print_info("Kafka", test_kafka) From ec15183734781d8d662931685581a20e9abf3da7 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Thu, 12 Mar 2020 17:20:11 +0100 Subject: [PATCH 11/22] Use docker inspect for checking number of restarts Check this number only if user request for it himself, too many false positives that would scare operator. 
Signed-off-by: Dobroslaw Zybort --- tools/check-health/README.md | 45 ++++++++++--- tools/check-health/cmm-check-health.py | 90 +++++++++++++++++++++----- 2 files changed, 108 insertions(+), 27 deletions(-) diff --git a/tools/check-health/README.md b/tools/check-health/README.md index 35e8ac808..4b88d14ac 100644 --- a/tools/check-health/README.md +++ b/tools/check-health/README.md @@ -24,22 +24,30 @@ python2 cmm-check-health.py You can use the following arguments to script: -| Short | Long | Default | Description | -|-------|----------------|---------|------------------------------------------------------------| -| -m | --metrics | False | Check metrics pipeline | -| -l | --logs | False | Check logs pipeline | -| -k | --kafka-lag | 20000 | Report warning when Kafka lag jump over this value | -| -r | --max-restarts | 10 | After this number of restarts of one service issue warning | +| Short | Long | Default | Description | +| ----- | -------------- | ------- | --------------------------------------------------- | +| -m | --metrics | False | Check metrics pipeline | +| -l | --logs | False | Check logs pipeline | +| -k | --kafka-lag | 20000 | Report warning when Kafka lag jump over this value | +| -r | --max-restarts | -1 | After this number of service restarts issue warning | If you start script without `--metrics` and `--logs` arguments both pipelines will be checked. +```bash +python3 cmm-check-health.py -k=100 -m +``` + +Max restarts check is disabled by default because of too many false positives. +If you want to run it to check if number of restarts from the start of all +services is bigger than 20 use following command: + +```bash +python3 cmm-check-health.py -r=20 +``` + ## Checks provided by the script -* Checking Docker events for number of restarts of every service in the last - 24 hours (report warning when more than 10 restarts happen). -* Checking for number of restarts because "out of memory" errors (report on - every such event). * All services with the ability to check they status with some kind of request to them this request is done from inside they containers. * Checking output from previous requests for containing specific text (like @@ -48,3 +56,20 @@ will be checked. * Is anyone connected to the database? * Is MySQL database using all available connections? * Check lags in Kafka topics. +* Checking Docker for number of restarts of every service from the time they + was created (report warning when more than 10 restarts happen). +* Checking if any service was restarted because "out of memory" error. + +## Checking number of service restarts + +It's impossible to check exact number of restarts of services in the last +24 hours. Theoretically `docker events` provide this functionality but it's +limited to last 256 events. In CMM case that have a lot of containers running +at the same time on one machine it's useless because it showing only last +4 minutes of events. 
+ +If you still want to check Docker events use the following command: + +```bash +docker events --filter event=die --filter event=oom --since=24h --until=1s +``` diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index ff8e46156..7207a63fb 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -44,7 +44,7 @@ "-k", "--kafka-lag", default=20000, type=int, help="Report warning when Kafka lag jump over this value") parser.add_argument( - "-r", "--max-restarts", default=10, type=int, + "-r", "--max-restarts", default=-1, type=int, help="After this number of restarts of one service issue warning") ARGS = parser.parse_args() @@ -379,7 +379,7 @@ def test_kafka(): lags.append(int(partition[5])) biggest_lag = sorted(lags, reverse=True)[0] if biggest_lag > ARGS.kafka_lag: - print("Lag for group `{}`, topic `{}` grow over {}. Biggest found lag {}".format( + print("Lag for group `{}`, topic `{}` grow over {}. Biggest lag found: {}".format( row[0], row[1], ARGS.kafka_lag, biggest_lag)) print("You can print all lags with: `{} kafka ash -c '{}'`".format( " ".join(DOCKER_EXEC), check_cmd)) @@ -396,10 +396,14 @@ def test_kafka(): # ############################################################################### +# TODO: Not working properly with 20 Docker containers on one machine. +# Docker events provide only last 256 events and even health checks are logged +# so with all our services working on one machine it's provide us with events +# only from the last 4 minutes... def test_docker_events(): try: resp = subprocess.check_output( - ["docker", "system", "events", + ["docker", "events", "--filter", "event=die", "--filter", "event=oom", "--since=24h", "--until=1s"], stderr=subprocess.STDOUT, universal_newlines=True @@ -447,43 +451,95 @@ def test_docker_events(): return return_error +# test_docker_restarts will report number of Docker container restarts from +# the time it was created/started (like with `docker-compose up`). 
+def test_docker_restarts(): + try: + resp = subprocess.check_output( + ["docker inspect --format \ + 'ID={{.ID}} CREATED={{.Created}} RESTARTS={{.RestartCount}} \ + OOM={{.State.OOMKilled}} NAME={{.Name}}' \ + $(docker ps -aq)"], shell=True, + stderr=subprocess.STDOUT, universal_newlines=True + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + return_error = None + for row in resp.splitlines(): + lexer = shlex(row, posix=True) + # Separate words + lexer.whitespace = ", " + # Split only on whitespace chars + lexer.whitespace_split = True + # "=" is part of the word + lexer.wordchars += "=" + # Separate key=value pairs to dict, split each pair only on first "=" + parsed_row = dict(word.split("=", 1) for word in lexer) + + # Check for number of restarts + if int(parsed_row["RESTARTS"]) > ARGS.max_restarts: + print(" Service '{}' restarted at least {} times from the time " + "it was started: {}, please check" + .format(parsed_row["NAME"], + parsed_row["RESTARTS"], + # Remove milliseconds from creation time + parsed_row["CREATED"].split(".", 1)[0], + )) + return_error = 1 + + # Check if service got out of memmory error + if parsed_row["OOM"] != "false": + print(" Service '{}' was restarted because of out of memmory error, " + "please check" + .format(parsed_row["NAME"])) + return_error = 1 + + return return_error + ############################################################################### # # Run checks # ############################################################################### - -print_info("Docker events", test_docker_events) - # Metrics services if ARGS.metrics: print_info("Memcached", test_memcached) print_info("InfluxDB", test_influxdb) print_info("cAdvisor", test_cadvisor) - # print_info("Monasca Agent Forwarder", test_agent_forwarder) - # print_info("Monasca Agent Collector", test_agent-collector) + # print_info("Monasca Agent Forwarder", test_agent_forwarder) // no healthcheck + # print_info("Monasca Agent Collector", test_agent-collector) // no healthcheck print_info("Zookeeper", test_zookeeper) print_info("MySQL", test_mysql) print_info("Monasca API", test_monasca) - # print_info("Monasca Persister", test_monasca_persister) - # print_info("Monasca Thresh", test_thresh) - # print_info("Monasca Notification", test_monasca_notification) + # print_info("Monasca Persister", test_monasca_persister) // no healthcheck + # print_info("Monasca Thresh", test_thresh) // no healthcheck + # print_info("Monasca Notification", test_monasca_notification) // no healthcheck print_info("Grafana", test_grafana) # Logs services if ARGS.logs: - # print_info("Monasca Log Metrics", test_log_metrics) - # print_info("Monasca Log Persister", test_log_persister) - # print_info("Monasca Log Transformer", test_log_transformer) + # print_info("Monasca Log Metrics", test_log_metrics) // no healthcheck + # print_info("Monasca Log Persister", test_log_persister) // no healthcheck + # print_info("Monasca Log Transformer", test_log_transformer) // no healthcheck print_info("Elasticsearch", test_elasticsearch) print_info("Elasticsearch Curator", test_elasticsearch_curator) print_info("Kibana", test_kibana) - # print_info("Monasca Log API", test_log_api) - # print_info("Monasca Log Agent", test_log_agent) - # print_info("Monasca Logspout", test_logspout) + # print_info("Monasca Log API", test_log_api) // no healthcheck + # print_info("Monasca Log Agent", test_log_agent) // no healthcheck + # print_info("Monasca Logspout", test_logspout) // no healthcheck # Cross 
pipeline services if ARGS.metrics or ARGS.logs: print_info("Kafka", test_kafka) + +# TODO: Not working properly with running 20 Docker containers on one machine. +# print_info("Docker events", test_docker_events) + +# Check number of restarts only if user request for it himself. +if ARGS.max_restarts > 0: + print_info("Docker restarts", test_docker_restarts) From 24c059f31237bd734d4dd0de71e823b405fa7cd1 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Wed, 25 Mar 2020 13:29:49 +0100 Subject: [PATCH 12/22] Add option to provide different folder with config files Signed-off-by: Dobroslaw Zybort --- tools/check-health/README.md | 1 + tools/check-health/cmm-check-health.py | 49 +++++++++++++++++++++----- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/tools/check-health/README.md b/tools/check-health/README.md index 4b88d14ac..a7ea6d8cf 100644 --- a/tools/check-health/README.md +++ b/tools/check-health/README.md @@ -30,6 +30,7 @@ You can use the following arguments to script: | -l | --logs | False | Check logs pipeline | | -k | --kafka-lag | 20000 | Report warning when Kafka lag jump over this value | | -r | --max-restarts | -1 | After this number of service restarts issue warning | +| -f | --folder | CMM dir | Folder with `.env` and docker-compose config files | If you start script without `--metrics` and `--logs` arguments both pipelines will be checked. diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 7207a63fb..4dcf1a66c 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -5,6 +5,7 @@ import json import os import subprocess +import sys from argparse import ArgumentParser from shlex import shlex @@ -20,15 +21,6 @@ script_dir = os.path.dirname(os.path.abspath(__file__)) # Get out of tools dir to root dir with docker-compose yaml files root_dir = os.path.normpath(os.path.join(script_dir, os.path.pardir, os.path.pardir)) -compose_metrics_path = os.path.join(root_dir, "docker-compose-metric.yml") -compose_logs_path = os.path.join(root_dir, "docker-compose-log.yml") - -# String for using docker-compose to exec commands in all services -DOCKER_EXEC = ["docker-compose", - "--project-directory", root_dir, - "--file", compose_metrics_path, - "--file", compose_logs_path, - "exec"] prog_desc = "Cloud Monitoring Manager health check script." 
parser = ArgumentParser(description=prog_desc) @@ -47,8 +39,23 @@ "-r", "--max-restarts", default=-1, type=int, help="After this number of restarts of one service issue warning") +parser.add_argument( + "-f", "--folder", default=root_dir, + help="Folder with `.env` and docker-compose yaml config files") + ARGS = parser.parse_args() +dot_env_path = os.path.join(ARGS.folder, ".env") +compose_metrics_path = os.path.join(ARGS.folder, "docker-compose-metric.yml") +compose_logs_path = os.path.join(ARGS.folder, "docker-compose-log.yml") + +# String for using docker-compose to exec commands in all services +DOCKER_EXEC = ["docker-compose", + "--project-directory", ARGS.folder, + "--file", compose_metrics_path, + "--file", compose_logs_path, + "exec"] + # No arguments provided, check both pipelines if not ARGS.metrics and not ARGS.logs: ARGS.metrics = True @@ -72,6 +79,30 @@ def print_info(service_name, test_function): print("{}✔{} {} looks fine".format(CGREEN, CEND, service_name)) +############################################################################### +# +# Environment tests +# +############################################################################### + +print("Looking for `.env` and configuration files in: {}".format(ARGS.folder)) +if not os.path.isdir(ARGS.folder): + print("Folder does not exists: {}".format(ARGS.folder)) + print("Exiting") + sys.exit(1) + +config_files = [ + dot_env_path, + compose_metrics_path, + compose_logs_path +] +for cfile in config_files: + if not os.path.exists(cfile): + print("File does not exists: {}".format(cfile)) + print("Exiting") + sys.exit(1) + + ############################################################################### # # Metrics services From 23dfd5a1c0c452c3735ad8885e986f1166cf9a6c Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Wed, 25 Mar 2020 14:34:12 +0100 Subject: [PATCH 13/22] Catch wrong JSON reponses Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 28 ++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 4dcf1a66c..a11d7d196 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -239,7 +239,12 @@ def test_monasca(): print(exc) return 1 - jresp = json.loads(resp) + try: + jresp = json.loads(resp) + except ValueError as ex: + print("Monasca API returned wrong JSON response: {}".format(resp)) + return 1 + if jresp["error"]["title"] != "Unauthorized": print("Monasca API did not return properly") return 1 @@ -261,7 +266,12 @@ def test_grafana(): print("Grafana did not return properly") return 1 - jresp = json.loads(resp) + try: + jresp = json.loads(resp) + except ValueError as ex: + print("Grafana returned wrong JSON response: {}".format(resp)) + return 1 + if jresp["database"] != "ok": print("Grafana reported problem with database: {}".format(jresp['database'])) return 1 @@ -289,7 +299,12 @@ def test_elasticsearch(): print("Elasticsearch did not have 'monasca' cluster") return 1 - jresp = json.loads(resp) + try: + jresp = json.loads(resp) + except ValueError as ex: + print("Elasticsearch returned wrong JSON response: {}".format(resp)) + return 1 + if jresp["status"] == "red": print("Elasticsearch health check reports problem with cluster") return 1 @@ -324,7 +339,12 @@ def test_kibana(): print(exc) return 1 - jresp = json.loads(resp) + try: + jresp = json.loads(resp) + except ValueError as ex: + print("Kibana returned wrong JSON response: 
{}".format(resp)) + return 1 + if jresp["status"]["overall"]["state"] != "green": print("Kibana health check reports problem") return 1 From 43ead70411b7ff52991b87f4db943246c07658a0 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Wed, 25 Mar 2020 20:49:03 +0100 Subject: [PATCH 14/22] Set dir for all subprocess commands Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 42 ++++++++++++++++---------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index a11d7d196..3f83210a2 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -85,6 +85,16 @@ def print_info(service_name, test_function): # ############################################################################### +try: + resp = subprocess.check_output(["docker-compose", "--version"], + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + ) +except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + sys.exit(1) +print(resp) + print("Looking for `.env` and configuration files in: {}".format(ARGS.folder)) if not os.path.isdir(ARGS.folder): print("Folder does not exists: {}".format(ARGS.folder)) @@ -114,7 +124,7 @@ def test_memcached(): resp = subprocess.check_output( DOCKER_EXEC + ["memcached", "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -131,7 +141,7 @@ def test_influxdb(): dbs = subprocess.check_output( DOCKER_EXEC + ["influxdb", "influx", "-execute", "SHOW DATABASES"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -148,7 +158,7 @@ def test_cadvisor(): resp = subprocess.check_output( DOCKER_EXEC + ["cadvisor", "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -165,7 +175,7 @@ def test_zookeeper(): resp = subprocess.check_output( DOCKER_EXEC + ["zookeeper", "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -184,7 +194,7 @@ def test_mysql(): resp = subprocess.check_output( DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'show databases;'"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -199,7 +209,7 @@ def test_mysql(): max_conn = subprocess.check_output( DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'select @@max_connections;'"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -211,7 +221,7 @@ def test_mysql(): DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'SHOW STATUS WHERE `variable_name` = \"Threads_connected\";' | cut -f2"], - stderr=subprocess.STDOUT, universal_newlines=True + 
stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -232,7 +242,7 @@ def test_monasca(): resp = subprocess.check_output( DOCKER_EXEC + ["monasca", "ash", "-c", "curl http://localhost:8070/healthcheck"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -255,7 +265,7 @@ def test_grafana(): resp = subprocess.check_output( DOCKER_EXEC + ["grafana", "ash", "-c", "wget -qO- http://localhost:3000/api/health"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -288,7 +298,7 @@ def test_elasticsearch(): resp = subprocess.check_output( DOCKER_EXEC + ["elasticsearch", "ash", "-c", "curl -XGET 'localhost:9200/_cluster/health?pretty'"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -315,7 +325,7 @@ def test_elasticsearch_curator(): resp = subprocess.check_output( DOCKER_EXEC + ["elasticsearch-curator", "ash", "-c", "curator --dry-run --config /config.yml /action.yml"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -332,7 +342,7 @@ def test_kibana(): resp = subprocess.check_output( DOCKER_EXEC + ["kibana", "sh", "-c", "wget -qO- http://localhost:5601/api/status"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -361,7 +371,7 @@ def test_kafka(): resp = subprocess.check_output( DOCKER_EXEC + ["kafka", "ash", "-c", "kafka-topics.sh --list --zookeeper zookeeper:2181"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -411,7 +421,7 @@ def test_kafka(): resp = subprocess.check_output( DOCKER_EXEC + ["kafka", "ash", "-c", check_cmd], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -457,7 +467,7 @@ def test_docker_events(): ["docker", "events", "--filter", "event=die", "--filter", "event=oom", "--since=24h", "--until=1s"], - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -511,7 +521,7 @@ def test_docker_restarts(): 'ID={{.ID}} CREATED={{.Created}} RESTARTS={{.RestartCount}} \ OOM={{.State.OOMKilled}} NAME={{.Name}}' \ $(docker ps -aq)"], shell=True, - stderr=subprocess.STDOUT, universal_newlines=True + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: print(exc.output) From cba7b303837c3578e016bb27299c5c2f897c11f5 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Wed, 25 Mar 2020 21:08:00 +0100 Subject: [PATCH 15/22] Group import lines Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 3f83210a2..40d2a7110 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -6,7 +6,6 @@ import os import subprocess import sys - from argparse import ArgumentParser from shlex import shlex from time import localtime, gmtime, strftime From 965448a15a486eab7ffa65a302eaa6d7da5177df Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Thu, 26 Mar 2020 16:50:52 +0100 Subject: [PATCH 16/22] Use PORT in env varaibles for services Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 40d2a7110..2d3f2ccb4 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -120,6 +120,7 @@ def print_info(service_name, test_function): def test_memcached(): try: + # Memcached does not allow to change PORT inside the container resp = subprocess.check_output( DOCKER_EXEC + ["memcached", "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"], @@ -154,6 +155,7 @@ def test_influxdb(): def test_cadvisor(): try: + # cAdvisor does not allow to change PORT inside the container resp = subprocess.check_output( DOCKER_EXEC + ["cadvisor", "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"], @@ -171,6 +173,7 @@ def test_cadvisor(): def test_zookeeper(): try: + # Zookeeper does not allow to change PORT inside the container resp = subprocess.check_output( DOCKER_EXEC + ["zookeeper", "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"], @@ -240,7 +243,8 @@ def test_monasca(): try: resp = subprocess.check_output( DOCKER_EXEC + ["monasca", - "ash", "-c", "curl http://localhost:8070/healthcheck"], + "ash", "-c", + "curl http://localhost:$MONASCA_CONTAINER_API_PORT/healthcheck"], stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: @@ -261,6 +265,7 @@ def test_monasca(): def test_grafana(): try: + # Grafana does not allow to change PORT inside the container resp = subprocess.check_output( DOCKER_EXEC + ["grafana", "ash", "-c", "wget -qO- http://localhost:3000/api/health"], @@ -294,6 +299,7 @@ def test_grafana(): def test_elasticsearch(): try: + # Elasticsearch does not allow to change PORT inside the container resp = subprocess.check_output( DOCKER_EXEC + ["elasticsearch", "ash", "-c", "curl -XGET 'localhost:9200/_cluster/health?pretty'"], @@ -338,6 +344,7 @@ def test_elasticsearch_curator(): def test_kibana(): try: + # Kibana does not allow to change PORT inside the container resp = subprocess.check_output( DOCKER_EXEC + ["kibana", "sh", "-c", "wget -qO- http://localhost:5601/api/status"], @@ -369,7 +376,7 @@ def test_kafka(): try: resp = subprocess.check_output( DOCKER_EXEC + ["kafka", - "ash", "-c", "kafka-topics.sh --list --zookeeper zookeeper:2181"], + "ash", "-c", "kafka-topics.sh --list --zookeeper $ZOOKEEPER_CONNECTION_STRING"], stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder ) except subprocess.CalledProcessError as exc: @@ -398,7 +405,7 @@ def test_kafka(): print("'{}' not found in Kafka topics".format(topic)) return 1 - cons_cmd = "kafka-consumer-offset-checker.sh --zookeeper zookeeper:2181 --group {} --topic {}" + cons_cmd = "kafka-consumer-offset-checker.sh --zookeeper $ZOOKEEPER_CONNECTION_STRING --group {} --topic {}" groups_topics = [] if ARGS.metrics: @@ -550,9 
+557,9 @@ def test_docker_restarts(): )) return_error = 1 - # Check if service got out of memmory error + # Check if service got out of memory error if parsed_row["OOM"] != "false": - print(" Service '{}' was restarted because of out of memmory error, " + print(" Service '{}' was restarted because of out of memory error, " "please check" .format(parsed_row["NAME"])) return_error = 1 From 4a062455fe0f68c93bf74d645c3bb503814b4de0 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Thu, 26 Mar 2020 16:56:55 +0100 Subject: [PATCH 17/22] Add info to README about help Signed-off-by: Dobroslaw Zybort --- tools/check-health/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/check-health/README.md b/tools/check-health/README.md index a7ea6d8cf..bd559512d 100644 --- a/tools/check-health/README.md +++ b/tools/check-health/README.md @@ -31,6 +31,7 @@ You can use the following arguments to script: | -k | --kafka-lag | 20000 | Report warning when Kafka lag jump over this value | | -r | --max-restarts | -1 | After this number of service restarts issue warning | | -f | --folder | CMM dir | Folder with `.env` and docker-compose config files | +| -h | --help | | Show help | If you start script without `--metrics` and `--logs` arguments both pipelines will be checked. From 7ae739b31bb61ac20d8db47c9d37308c0f847428 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Thu, 26 Mar 2020 19:30:25 +0100 Subject: [PATCH 18/22] Check log-api status Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 30 +++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 2d3f2ccb4..a247dce61 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -286,8 +286,8 @@ def test_grafana(): print("Grafana returned wrong JSON response: {}".format(resp)) return 1 - if jresp["database"] != "ok": - print("Grafana reported problem with database: {}".format(jresp['database'])) + if ("database" not in jresp) or (jresp["database"] != "ok"): + print("Grafana reported problem with database: {}".format(jresp)) return 1 @@ -366,6 +366,30 @@ def test_kibana(): return 1 +def test_log_api(): + try: + resp = subprocess.check_output( + DOCKER_EXEC + ["log-api", + "sh", "-c", + "curl http://localhost:$MONASCA_CONTAINER_LOG_API_PORT/healthcheck"], + stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + try: + jresp = json.loads(resp) + except ValueError as ex: + print("Monasca LOG API returned wrong JSON response: {}".format(resp)) + return 1 + + if ("kafka" not in jresp) or (jresp["kafka"] != "OK"): + print("Monasca LOG API did not return properly: {}".format(jresp)) + return 1 + + ############################################################################### # # Cross pipeline services @@ -596,7 +620,7 @@ def test_docker_restarts(): print_info("Elasticsearch", test_elasticsearch) print_info("Elasticsearch Curator", test_elasticsearch_curator) print_info("Kibana", test_kibana) - # print_info("Monasca Log API", test_log_api) // no healthcheck + print_info("Monasca Log API", test_log_api) # print_info("Monasca Log Agent", test_log_agent) // no healthcheck # print_info("Monasca Logspout", test_logspout) // no healthcheck From f76a8c1005cf804a0867c1ac3c5dd079dd9dd9f4 Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Thu, 26 Mar 2020 19:55:57 
+0100 Subject: [PATCH 19/22] Properly expand config folder path Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 56 ++++++++++++++------------ 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index a247dce61..90f52b6f9 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -44,13 +44,17 @@ ARGS = parser.parse_args() -dot_env_path = os.path.join(ARGS.folder, ".env") -compose_metrics_path = os.path.join(ARGS.folder, "docker-compose-metric.yml") -compose_logs_path = os.path.join(ARGS.folder, "docker-compose-log.yml") +config_dir = os.path.abspath( + os.path.expanduser(os.path.expandvars(ARGS.folder)) +) + +dot_env_path = os.path.join(config_dir, ".env") +compose_metrics_path = os.path.join(config_dir, "docker-compose-metric.yml") +compose_logs_path = os.path.join(config_dir, "docker-compose-log.yml") # String for using docker-compose to exec commands in all services DOCKER_EXEC = ["docker-compose", - "--project-directory", ARGS.folder, + "--project-directory", config_dir, "--file", compose_metrics_path, "--file", compose_logs_path, "exec"] @@ -86,7 +90,7 @@ def print_info(service_name, test_function): try: resp = subprocess.check_output(["docker-compose", "--version"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -94,9 +98,9 @@ def print_info(service_name, test_function): sys.exit(1) print(resp) -print("Looking for `.env` and configuration files in: {}".format(ARGS.folder)) -if not os.path.isdir(ARGS.folder): - print("Folder does not exists: {}".format(ARGS.folder)) +print("Looking for `.env` and configuration files in: {}".format(config_dir)) +if not os.path.isdir(config_dir): + print("Folder does not exist: {}".format(config_dir)) print("Exiting") sys.exit(1) @@ -107,7 +111,7 @@ def print_info(service_name, test_function): ] for cfile in config_files: if not os.path.exists(cfile): - print("File does not exists: {}".format(cfile)) + print("File does not exist: {}".format(cfile)) print("Exiting") sys.exit(1) @@ -124,7 +128,7 @@ def test_memcached(): resp = subprocess.check_output( DOCKER_EXEC + ["memcached", "ash", "-c", "echo stats | nc -w 1 127.0.0.1 11211"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -141,7 +145,7 @@ def test_influxdb(): dbs = subprocess.check_output( DOCKER_EXEC + ["influxdb", "influx", "-execute", "SHOW DATABASES"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -159,7 +163,7 @@ def test_cadvisor(): resp = subprocess.check_output( DOCKER_EXEC + ["cadvisor", "wget", "--tries=1", "--spider", "http://127.0.0.1:8080/healthz"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -177,7 +181,7 @@ def test_zookeeper(): resp = subprocess.check_output( DOCKER_EXEC + ["zookeeper", "bash", "-c", "echo mntr | nc -w 1 127.0.0.1 2181"], - stderr=subprocess.STDOUT, universal_newlines=True, 
cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -196,7 +200,7 @@ def test_mysql(): resp = subprocess.check_output( DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'show databases;'"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -211,7 +215,7 @@ def test_mysql(): max_conn = subprocess.check_output( DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'select @@max_connections;'"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -223,7 +227,7 @@ def test_mysql(): DOCKER_EXEC + ["mysql", "bash", "-c", mysql_conn + "-e 'SHOW STATUS WHERE `variable_name` = \"Threads_connected\";' | cut -f2"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -245,7 +249,7 @@ def test_monasca(): DOCKER_EXEC + ["monasca", "ash", "-c", "curl http://localhost:$MONASCA_CONTAINER_API_PORT/healthcheck"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -269,7 +273,7 @@ def test_grafana(): resp = subprocess.check_output( DOCKER_EXEC + ["grafana", "ash", "-c", "wget -qO- http://localhost:3000/api/health"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -303,7 +307,7 @@ def test_elasticsearch(): resp = subprocess.check_output( DOCKER_EXEC + ["elasticsearch", "ash", "-c", "curl -XGET 'localhost:9200/_cluster/health?pretty'"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -330,7 +334,7 @@ def test_elasticsearch_curator(): resp = subprocess.check_output( DOCKER_EXEC + ["elasticsearch-curator", "ash", "-c", "curator --dry-run --config /config.yml /action.yml"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -348,7 +352,7 @@ def test_kibana(): resp = subprocess.check_output( DOCKER_EXEC + ["kibana", "sh", "-c", "wget -qO- http://localhost:5601/api/status"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -372,7 +376,7 @@ def test_log_api(): DOCKER_EXEC + ["log-api", "sh", "-c", "curl http://localhost:$MONASCA_CONTAINER_LOG_API_PORT/healthcheck"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -401,7 +405,7 @@ def test_kafka(): resp = subprocess.check_output( DOCKER_EXEC + ["kafka", "ash", "-c", "kafka-topics.sh --list 
--zookeeper $ZOOKEEPER_CONNECTION_STRING"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -451,7 +455,7 @@ def test_kafka(): resp = subprocess.check_output( DOCKER_EXEC + ["kafka", "ash", "-c", check_cmd], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -497,7 +501,7 @@ def test_docker_events(): ["docker", "events", "--filter", "event=die", "--filter", "event=oom", "--since=24h", "--until=1s"], - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) @@ -551,7 +555,7 @@ def test_docker_restarts(): 'ID={{.ID}} CREATED={{.Created}} RESTARTS={{.RestartCount}} \ OOM={{.State.OOMKilled}} NAME={{.Name}}' \ $(docker ps -aq)"], shell=True, - stderr=subprocess.STDOUT, universal_newlines=True, cwd=ARGS.folder + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) except subprocess.CalledProcessError as exc: print(exc.output) From db821aa86952b7d4e2a2eb409d10720a30e9257b Mon Sep 17 00:00:00 2001 From: Dobroslaw Zybort Date: Thu, 26 Mar 2020 21:08:20 +0100 Subject: [PATCH 20/22] Handle missing lags in Kafka Signed-off-by: Dobroslaw Zybort --- tools/check-health/cmm-check-health.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index 90f52b6f9..da8ca017f 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -448,6 +448,8 @@ def test_kafka(): ("log-persister", "log-transformed"), ("log-metric", "log-transformed") ]) + + no_lag = False bad_lag = False for row in groups_topics: check_cmd = cons_cmd.format(row[0], row[1]) @@ -465,7 +467,17 @@ def test_kafka(): # Parse output from listing partitions reader = csv.reader(resp.split('\n'), delimiter=' ', skipinitialspace=True) # Remove depreciation waring and row with column titles - partition_list = list(reader)[2:] + p_list = list(reader)[2:] + # Remove all empty lines + partition_list = [x for x in p_list if x] + + # If no lag returned report error + if len(partition_list) == 0: + print(" Lag for group `{}` with topic `{}` not found".format( + row[0], row[1])) + print(" You can print lags with: `{} kafka ash -c '{}'`".format( + " ".join(DOCKER_EXEC), check_cmd)) + no_lag = True lags = [] for partition in partition_list: @@ -474,14 +486,14 @@ def test_kafka(): lags.append(int(partition[5])) biggest_lag = sorted(lags, reverse=True)[0] if biggest_lag > ARGS.kafka_lag: - print("Lag for group `{}`, topic `{}` grow over {}. Biggest lag found: {}".format( + print(" Lag for group `{}`, topic `{}` grow over {}. 
Biggest lag found: {}".format(
+                row[0], row[1], ARGS.kafka_lag, biggest_lag))
+            print(" You can print all lags with: `{} kafka ash -c '{}'`".format(
+                " ".join(DOCKER_EXEC), check_cmd))
             bad_lag = True
 
-    if bad_lag:
-        # If too big lag was found return with error
+    if no_lag or bad_lag:
+        # If no lag or too big lag was found return with error
         return 1

From 66f2e260654a98a56220c4d01d804d71522b5ded Mon Sep 17 00:00:00 2001
From: Dobroslaw Zybort
Date: Thu, 26 Mar 2020 21:20:35 +0100
Subject: [PATCH 21/22] Update readme with services without health checks

Signed-off-by: Dobroslaw Zybort
---
 tools/check-health/README.md | 19 +++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tools/check-health/README.md b/tools/check-health/README.md
index bd559512d..1c146f2d4 100644
--- a/tools/check-health/README.md
+++ b/tools/check-health/README.md
@@ -62,6 +62,25 @@ python3 cmm-check-health.py -r=20
   were created (report warning when more than 10 restarts happen).
 * Checking if any service was restarted because of an "out of memory" error.
 
+### Services without health checks
+
+The following services do not have health checks and are not tested
+to verify that they are working properly:
+
+* Monasca Agent Forwarder
+* Monasca Agent Collector
+* Monasca Persister
+* Monasca Thresh
+* Monasca Notification
+* Monasca Log Metrics
+* Monasca Log Persister
+* Monasca Log Transformer
+* Monasca Log Agent
+* Monasca Log Spout
+
+They are still tested for too many restarts if `-r` is used.
+
+
 ## Checking number of service restarts

From 03007bd013995244ed42998600eb294d04d92f89 Mon Sep 17 00:00:00 2001
From: Dobroslaw Zybort
Date: Tue, 31 Mar 2020 18:40:43 +0200
Subject: [PATCH 22/22] Enable checking docker events for restarts

Signed-off-by: Dobroslaw Zybort
---
 tools/check-health/README.md           |  9 ++-
 tools/check-health/cmm-check-health.py | 99 +++++++++++++++++++-------
 2 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/tools/check-health/README.md b/tools/check-health/README.md
index 1c146f2d4..e4d357997 100644
--- a/tools/check-health/README.md
+++ b/tools/check-health/README.md
@@ -83,12 +83,17 @@ They are still tested for too many restarts if `-r` is used.
 
 ## Checking number of service restarts
 
-It's impossible to check the exact number of restarts of services in the last
-24 hours. Theoretically `docker events` provides this functionality, but it's
+By default the script checks Docker events for every out of memory error and
+every restart in the last 24 hours.
+But it's impossible to check the exact number of restarts of services in this
+time frame. Theoretically `docker events` provides this functionality, but it's
 limited to the last 256 events. In the CMM case, with a lot of containers running
 at the same time on one machine, it's useless because it shows only the last
 4 minutes of events.
+You can change the number of restarts that triggers a warning with the `-r` command
+ If you still want to check Docker events use the following command: ```bash diff --git a/tools/check-health/cmm-check-health.py b/tools/check-health/cmm-check-health.py index da8ca017f..a032685b2 100644 --- a/tools/check-health/cmm-check-health.py +++ b/tools/check-health/cmm-check-health.py @@ -7,6 +7,7 @@ import subprocess import sys from argparse import ArgumentParser +from datetime import datetime from shlex import shlex from time import localtime, gmtime, strftime @@ -503,15 +504,55 @@ def test_kafka(): # ############################################################################### -# TODO: Not working properly with 20 Docker containers on one machine. -# Docker events provide only last 256 events and even health checks are logged -# so with all our services working on one machine it's provide us with events -# only from the last 4 minutes... +# Check Docker events for out of memory errors and restarts in last 24 hours. +# Take into account that with all CMM services running on one machine Docker +# will provide events only from last 4 minutes. def test_docker_events(): + # Get oldest event (for reporting) + try: + first_resp = subprocess.Popen( + ["docker", "events", + "--format", "'{{json .}}'", + "--since=24h", "--until=1s"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir + ) + # We want only first line, avoid shell=True. + resp = subprocess.check_output( + ["head", "-1"], stdin=first_resp.stdout, + stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir + ) + except subprocess.CalledProcessError as exc: + print(exc.output) + print(exc) + return 1 + + try: + # Strip new lines and ' from start and end + clean_resp = resp.strip().strip("'") + jresp = json.loads(clean_resp) + except ValueError as ex: + print("Docker events returned wrong JSON format: {}".format(clean_resp)) + return 1 + + # Calculate time of the oldest event + first_event_timestamp = datetime.fromtimestamp(jresp["time"]) + full_events_time_range = datetime.now() - first_event_timestamp + fetr_in_seconds = full_events_time_range.total_seconds() + + print(" Oldest returned event was {:.0f} hours, " + "{:.0f} minutes and {} seconds ago".format( + fetr_in_seconds//3600, + fetr_in_seconds%3600//60, + fetr_in_seconds%60) + ) + + # Get all restarts of Docker containers try: resp = subprocess.check_output( ["docker", "events", "--filter", "event=die", "--filter", "event=oom", + "--format", "'{{json .}}'", "--since=24h", "--until=1s"], stderr=subprocess.STDOUT, universal_newlines=True, cwd=config_dir ) @@ -524,35 +565,39 @@ def test_docker_events(): return_error = None for row in resp.splitlines(): + try: + # Strip new lines and ' from start and end + clean_resp = row.strip().strip("'") + jresp = json.loads(clean_resp) + except ValueError as ex: + print("Docker events returned wrong JSON format: {}".format(clean_resp)) + return 1 - tags = row[row.find('(')+1:-1] - lexer = shlex(tags, posix=True) - # Separate words - lexer.whitespace = ", " - # Split only on whitespace chars - lexer.whitespace_split = True - # "=" is part of the word - lexer.wordchars += "=" - # Separate key=value pairs to dict, split each pair only on first "=" - parsed_row = dict(word.split("=", 1) for word in lexer) - service = parsed_row["com.docker.compose.service"] + service_name = jresp["Actor"]["Attributes"]["name"] # Check for out of memory errors - if "container oom" in row: - print(" Service '{}' got killed in the last 24 hours because " + if "oom" in jresp["Action"]: + print(" Service '{}' 
got killed because " "of out of memory error, please check" - .format(service)) + .format(service_name)) return_error = 1 - if service not in filtered_list: - filtered_list[service] = {"restarts": 0} - filtered_list[service]["restarts"] += 1 + # Add service to list with number of restarts + if service_name not in filtered_list: + filtered_list[service_name] = {"restarts": 0} + filtered_list[service_name]["restarts"] += 1 for key in filtered_list: - if filtered_list[key]["restarts"] > ARGS.max_restarts: - print(" Service '{}' restarted at least {} times in last " - "24 hours, please check" - .format(key, filtered_list[key]["restarts"])) + if ARGS.max_restarts != -1: + # If number of restarts configured by user + if filtered_list[key]["restarts"] > ARGS.max_restarts: + print(" Service '{}' restarted {} times in checked time range" + .format(key, filtered_list[key]["restarts"])) + return_error = 1 + else: + # Report all number of restarts (not configured by user) + print(" Service '{}' restarted {} times in checked time range" + .format(key, filtered_list[key]["restarts"])) return_error = 1 return return_error @@ -644,8 +689,8 @@ def test_docker_restarts(): if ARGS.metrics or ARGS.logs: print_info("Kafka", test_kafka) -# TODO: Not working properly with running 20 Docker containers on one machine. -# print_info("Docker events", test_docker_events) +# Check Docker events for restarts. +print_info("Docker events", test_docker_events) # Check number of restarts only if user request for it himself. if ARGS.max_restarts > 0: