Feature 10981: extended Prometheus sidecar to dump alerts rules in config files 97/13197/7
authoraguilard <e.dah.tid@telefonica.com>
Thu, 13 Apr 2023 16:57:57 +0000 (16:57 +0000)
committeraguilard <e.dah.tid@telefonica.com>
Tue, 25 Apr 2023 09:59:22 +0000 (09:59 +0000)
Change-Id: Ic454c894b60d0b2b88b6ea81ca35a0bf4d54ebac
Signed-off-by: aguilard <e.dah.tid@telefonica.com>
docker/Prometheus/Dockerfile
docker/Prometheus/src/app.py

index 228f597..1bdbaf9 100644 (file)
@@ -22,6 +22,8 @@ ENV PROMETHEUS_URL http://prometheus:9090
 ENV MONGODB_URL mongodb://mongo:27017
 ENV PROMETHEUS_CONFIG_FILE /etc/prometheus/prometheus.yml
 ENV PROMETHEUS_BASE_CONFIG_FILE /etc/prometheus_base/prometheus.yml
+ENV PROMETHEUS_ALERTS_FILE /etc/prometheus/osm_alert_rules.yml
+ENV PROMETHEUS_BASE_ALERTS_FILE /etc/prometheus_base/osm_alert_rules.yml
 ENV TARGET_DATABASE osm
 
 WORKDIR /code
index b06f448..36b7b52 100755 (executable)
 # contact: fbravo@whitestack.com
 ##
 
-import os
-import pymongo
-import yaml
 import aiohttp
 import asyncio
+from bson.json_util import dumps
+from bson import ObjectId
 import copy
+from datetime import datetime
 import json
+import os
+import pymongo
 import time
-from bson.json_util import dumps
-from bson import ObjectId
+import yaml
 
 # Env variables
 mongodb_url = os.environ["MONGODB_URL"]
 target_database = os.environ["TARGET_DATABASE"]
 prometheus_config_file = os.environ["PROMETHEUS_CONFIG_FILE"]
 prometheus_base_config_file = os.environ["PROMETHEUS_BASE_CONFIG_FILE"]
+prometheus_alerts_file = os.environ["PROMETHEUS_ALERTS_FILE"]
+prometheus_base_alerts_file = os.environ["PROMETHEUS_BASE_ALERTS_FILE"]
+
 prometheus_url = os.environ["PROMETHEUS_URL"]
 
 
@@ -45,6 +49,10 @@ def get_jobs(client):
     return json.loads(dumps(client[target_database].prometheus_jobs.find({})))
 
 
+def get_alerts(client):
+    return json.loads(dumps(client[target_database].alerts.find({"prometheus_config": {"$exists": True}})))
+
+
 def save_successful_jobs(client, jobs):
     for job in jobs:
         client[target_database].prometheus_jobs.update_one(
@@ -88,6 +96,29 @@ def generate_prometheus_config(prometheus_jobs, config_file_path):
     return config_file_yaml
 
 
+def generate_prometheus_alerts(prometheus_alerts, config_file_path):
+    with open(config_file_path, encoding="utf-8", mode="r") as config_file:
+        config_file_yaml = yaml.safe_load(config_file)
+    if config_file_yaml is None:
+        config_file_yaml = {}
+    if "groups" not in config_file_yaml:
+        config_file_yaml["groups"] = []
+
+    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    group = {
+        "name": f"_osm_alert_rules_{timestamp}_",
+        "rules": [],
+    }
+    for alert in prometheus_alerts:
+        if "prometheus_config" in alert:
+            group["rules"].append(alert["prometheus_config"])
+
+    if group["rules"]:
+        config_file_yaml["groups"].append(group)
+
+    return config_file_yaml
+
+
 async def reload_prometheus_config(prom_url):
     async with aiohttp.ClientSession() as session:
         async with session.post(prom_url + "/-/reload") as resp:
@@ -131,7 +162,7 @@ async def validate_configuration(prom_url, new_config):
 
 async def main_task(client):
     stored_jobs = get_jobs(client)
-    print(f"Jobs detected : {len(stored_jobs):d}")
+    print(f"Jobs detected: {len(stored_jobs):d}")
     generated_prometheus_config = generate_prometheus_config(
         stored_jobs, prometheus_base_config_file
     )
@@ -141,6 +172,20 @@ async def main_task(client):
     print(yaml.safe_dump(generated_prometheus_config))
     config_file.write(yaml.safe_dump(generated_prometheus_config))
     config_file.close()
+
+    if os.path.isfile(prometheus_base_alerts_file):
+        stored_alerts = get_alerts(client)
+        print(f"Alerts read: {len(stored_alerts):d}")
+        generated_prometheus_alerts = generate_prometheus_alerts(
+            stored_alerts, prometheus_base_alerts_file
+        )
+        print(f"Writing new alerts file to {prometheus_alerts_file}")
+        config_file = open(prometheus_alerts_file, "w")
+        config_file.truncate(0)
+        print(yaml.safe_dump(generated_prometheus_alerts))
+        config_file.write(yaml.safe_dump(generated_prometheus_alerts))
+        config_file.close()
+
     print("New config written, updating prometheus")
     update_resp = await reload_prometheus_config(prometheus_url)
     is_valid = await validate_configuration(prometheus_url, generated_prometheus_config)
@@ -161,9 +206,9 @@ async def main():
     # Initial loop. First refresh of prometheus config file
     first_refresh_completed = False
     tries = 1
-    while tries <= 3:
+    while tries <= 3 and first_refresh_completed == False:
         try:
-            print("Refreshing prometheus config file for first time")
+            print("Generating prometheus config files")
             await main_task(client)
             first_refresh_completed = True
         except Exception as error:
@@ -179,23 +224,21 @@ async def main():
     while True:
         try:
             # Needs mongodb in replica mode as this feature relies in OpLog
-            change_stream = client[target_database].prometheus_jobs.watch(
+            change_stream = client[target_database].watch(
                 [
                     {
                         "$match": {
-                            # If you want to modify a particular job,
-                            # delete and insert it again
-                            "operationType": {"$in": ["insert", "delete"]}
+                            "operationType": {"$in": ["insert", "delete"]},
+                            "ns.coll": { "$in": ["prometheus_jobs", "alerts"]},
                         }
                     }
                 ]
             )
 
             # Single thread, no race conditions and ops are queued up in order
-            print("Listening to changes in prometheus jobs collection")
+            print("Listening to changes in prometheus jobs and alerts collections")
             for change in change_stream:
-                print("Change detected, updating prometheus config")
-                print(f"{change}")
+                print("Changes detected, updating prometheus config")
                 await main_task(client)
                 print()
         except Exception as error: