From: aguilard
Date: Thu, 13 Apr 2023 16:57:57 +0000 (+0000)
Subject: Feature 10981: extended Prometheus sidecar to dump alerts rules in config files
X-Git-Tag: release-v14.0-start~79
X-Git-Url: https://osm.etsi.org/gitweb/?a=commitdiff_plain;h=f91f9c32f0d3cd4011a288235d0e4e1ba44b85d5;p=osm%2Fdevops.git

Feature 10981: extended Prometheus sidecar to dump alerts rules in config files

Change-Id: Ic454c894b60d0b2b88b6ea81ca35a0bf4d54ebac
Signed-off-by: aguilard
---

diff --git a/docker/Prometheus/Dockerfile b/docker/Prometheus/Dockerfile
index 228f5979..1bdbaf9a 100644
--- a/docker/Prometheus/Dockerfile
+++ b/docker/Prometheus/Dockerfile
@@ -22,6 +22,8 @@ ENV PROMETHEUS_URL http://prometheus:9090
 ENV MONGODB_URL mongodb://mongo:27017
 ENV PROMETHEUS_CONFIG_FILE /etc/prometheus/prometheus.yml
 ENV PROMETHEUS_BASE_CONFIG_FILE /etc/prometheus_base/prometheus.yml
+ENV PROMETHEUS_ALERTS_FILE /etc/prometheus/osm_alert_rules.yml
+ENV PROMETHEUS_BASE_ALERTS_FILE /etc/prometheus_base/osm_alert_rules.yml
 ENV TARGET_DATABASE osm

 WORKDIR /code
diff --git a/docker/Prometheus/src/app.py b/docker/Prometheus/src/app.py
index b06f4486..36b7b52e 100755
--- a/docker/Prometheus/src/app.py
+++ b/docker/Prometheus/src/app.py
@@ -22,22 +22,26 @@
 # contact: fbravo@whitestack.com
 ##

-import os
-import pymongo
-import yaml
 import aiohttp
 import asyncio
+from bson.json_util import dumps
+from bson import ObjectId
 import copy
+from datetime import datetime
 import json
+import os
+import pymongo
 import time
-from bson.json_util import dumps
-from bson import ObjectId
+import yaml

 # Env variables
 mongodb_url = os.environ["MONGODB_URL"]
 target_database = os.environ["TARGET_DATABASE"]
 prometheus_config_file = os.environ["PROMETHEUS_CONFIG_FILE"]
 prometheus_base_config_file = os.environ["PROMETHEUS_BASE_CONFIG_FILE"]
+prometheus_alerts_file = os.environ["PROMETHEUS_ALERTS_FILE"]
+prometheus_base_alerts_file = os.environ["PROMETHEUS_BASE_ALERTS_FILE"]
+
 prometheus_url = os.environ["PROMETHEUS_URL"]


@@ -45,6 +49,10 @@ def get_jobs(client):
     return json.loads(dumps(client[target_database].prometheus_jobs.find({})))


+def get_alerts(client):
+    return json.loads(dumps(client[target_database].alerts.find({"prometheus_config": {"$exists": True}})))
+
+
 def save_successful_jobs(client, jobs):
     for job in jobs:
         client[target_database].prometheus_jobs.update_one(
@@ -88,6 +96,29 @@ def generate_prometheus_config(prometheus_jobs, config_file_path):
     return config_file_yaml


+def generate_prometheus_alerts(prometheus_alerts, config_file_path):
+    with open(config_file_path, encoding="utf-8", mode="r") as config_file:
+        config_file_yaml = yaml.safe_load(config_file)
+    if config_file_yaml is None:
+        config_file_yaml = {}
+    if "groups" not in config_file_yaml:
+        config_file_yaml["groups"] = []
+
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    group = {
+        "name": f"_osm_alert_rules_{timestamp}_",
+        "rules": [],
+    }
+    for alert in prometheus_alerts:
+        if "prometheus_config" in alert:
+            group["rules"].append(alert["prometheus_config"])
+
+    if group["rules"]:
+        config_file_yaml["groups"].append(group)
+
+    return config_file_yaml
+
+
 async def reload_prometheus_config(prom_url):
     async with aiohttp.ClientSession() as session:
         async with session.post(prom_url + "/-/reload") as resp:
@@ -131,7 +162,7 @@ async def validate_configuration(prom_url, new_config):

 async def main_task(client):
     stored_jobs = get_jobs(client)
-    print(f"Jobs detected : {len(stored_jobs):d}")
+    print(f"Jobs detected: {len(stored_jobs):d}")
     generated_prometheus_config = generate_prometheus_config(
         stored_jobs, prometheus_base_config_file
     )
@@ -141,6 +172,20 @@ async def main_task(client):
     print(yaml.safe_dump(generated_prometheus_config))
     config_file.write(yaml.safe_dump(generated_prometheus_config))
     config_file.close()
+
+    if os.path.isfile(prometheus_base_alerts_file):
+        stored_alerts = get_alerts(client)
+        print(f"Alerts read: {len(stored_alerts):d}")
+        generated_prometheus_alerts = generate_prometheus_alerts(
+            stored_alerts, prometheus_base_alerts_file
+        )
+        print(f"Writing new alerts file to {prometheus_alerts_file}")
+        config_file = open(prometheus_alerts_file, "w")
+        config_file.truncate(0)
+        print(yaml.safe_dump(generated_prometheus_alerts))
+        config_file.write(yaml.safe_dump(generated_prometheus_alerts))
+        config_file.close()
+
     print("New config written, updating prometheus")
     update_resp = await reload_prometheus_config(prometheus_url)
     is_valid = await validate_configuration(prometheus_url, generated_prometheus_config)
@@ -161,9 +206,9 @@ async def main():
     # Initial loop. First refresh of prometheus config file
     first_refresh_completed = False
     tries = 1
-    while tries <= 3:
+    while tries <= 3 and first_refresh_completed == False:
         try:
-            print("Refreshing prometheus config file for first time")
+            print("Generating prometheus config files")
             await main_task(client)
             first_refresh_completed = True
         except Exception as error:
@@ -179,23 +224,21 @@ async def main():
     while True:
         try:
             # Needs mongodb in replica mode as this feature relies in OpLog
-            change_stream = client[target_database].prometheus_jobs.watch(
+            change_stream = client[target_database].watch(
                 [
                     {
                         "$match": {
-                            # If you want to modify a particular job,
-                            # delete and insert it again
-                            "operationType": {"$in": ["insert", "delete"]}
+                            "operationType": {"$in": ["insert", "delete"]},
+                            "ns.coll": {"$in": ["prometheus_jobs", "alerts"]},
                         }
                     }
                 ]
            )

             # Single thread, no race conditions and ops are queued up in order
-            print("Listening to changes in prometheus jobs collection")
+            print("Listening to changes in prometheus jobs and alerts collections")
             for change in change_stream:
-                print("Change detected, updating prometheus config")
-                print(f"{change}")
+                print("Changes detected, updating prometheus config")
                 await main_task(client)
                 print()
         except Exception as error:
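For reference, the standalone sketch below (not part of the patch) reproduces the grouping logic that the new generate_prometheus_alerts() function applies before the result is written to PROMETHEUS_ALERTS_FILE. It assumes hypothetical documents in the alerts collection whose prometheus_config field holds a standard Prometheus alerting rule; the sample identifiers, rule name and PromQL expression are illustrative, not taken from the OSM data model, and the read of the base file at PROMETHEUS_BASE_ALERTS_FILE is omitted (an empty rule file is assumed).

# Standalone sketch with hypothetical sample data; only the "prometheus_config"
# field of each document is meaningful to the sidecar logic in this patch.
from datetime import datetime

import yaml

sample_alerts = [
    {
        "uuid": "example-uuid",  # illustrative identifier
        "prometheus_config": {
            "alert": "example_vdu_down",  # hypothetical rule name
            "expr": "vm_status == 0",     # hypothetical PromQL expression
            "for": "1m",
            "labels": {"severity": "critical"},
        },
    },
    {"uuid": "no-rule"},  # skipped: no "prometheus_config" field
]

# One timestamped group collects every stored rule; it is appended only when
# at least one rule was found, mirroring generate_prometheus_alerts().
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
group = {"name": f"_osm_alert_rules_{timestamp}_", "rules": []}
for alert in sample_alerts:
    if "prometheus_config" in alert:
        group["rules"].append(alert["prometheus_config"])

rule_file = {"groups": [group]} if group["rules"] else {"groups": []}
print(yaml.safe_dump(rule_file))

Note that Prometheus only evaluates the generated osm_alert_rules.yml if the prometheus.yml it runs with lists that file under rule_files; this patch only regenerates the file on changes to the prometheus_jobs and alerts collections and then triggers the existing /-/reload call.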