# -*- coding: utf-8 -*-

##
# Copyright 2020 Telefonica S.A.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
##
18
19import asyncio
20from time import time
21import logging
22import aiohttp
23import yaml
24import os
25from osm_lcm.lcm_utils import LcmException
26from osm_common.dbbase import DbException
tierno89f82902020-07-03 14:52:28 +000027from jinja2 import Template, TemplateError, TemplateNotFound, TemplateSyntaxError
tiernof800c5c2020-06-30 13:24:17 +000028
__author__ = "Alfonso Tierno <alfonso.tiernosepulveda@telefonica.com>"

# Document created at database collection "admin" (with _id "prometheus") the first time
# that no prometheus document is found there.
initial_prometheus_data = {
    "_id": "prometheus",
    "_admin": {
        "locked_at": 0,  # time at which a worker took the lock; 0 means unlocked
        "locked_by": None,  # worker_id of the lock owner
        "modified": 1593445184,  # 2020-06-29
        "created": 1593445184,
        "version": "1.0"  # to allow future version updates
    },
    'scrape_configs': {  # Dictionary at database. Converted to list before sending to prometheus
        'mon_exporter': {'static_configs': [{'targets': ['mon:8000']}], 'job_name': 'mon_exporter'},
    },
    'global': {'evaluation_interval': '15s', 'scrape_interval': '15s'},
    'rule_files': None,
    'alerting': {'alertmanagers': [{'static_configs': [{'targets': None}]}]}
}
47
48
class Prometheus:
    """
    Implements a class to update Prometheus

    The configuration is stored at database collection "admin", document "prometheus". On every change the
    document is locked, dumped as yaml to the prometheus configuration file and prometheus is asked to reload it.
    """

    # Seconds after which a lock set at the database by a worker can be taken over by another one.
    # Half of this time is waited between retries to get the lock.
    PROMETHEUS_LOCKED_TIME = 120

    def __init__(self, config, worker_id, db, loop, logger=None):
        """
        :param config: dictionary with "uri" (base URL of the prometheus server) and "path" (folder where the
            prometheus configuration file is written)
        :param worker_id: identity stored at the database lock while this instance holds it
        :param db: database object providing get_one, set_one and create
        :param loop: asyncio loop. Stored for reference only; coroutines use the running loop
        :param logger: logger to use. By default "lcm.prometheus"
        """
        self.worker_id = worker_id
        self.db = db
        self.loop = loop
        self.logger = logger or logging.getLogger("lcm.prometheus")
        self.server = config["uri"]
        # endpoints are appended as relative paths ("-/reload", "api/v1/status/config"), so the base URL
        # must end with a slash, same as done below for the path
        if not self.server.endswith("/"):
            self.server += "/"
        self.path = config["path"]
        if not self.path.endswith("/"):
            self.path += "/"
        self.cfg_file = self.path + "prometheus.yml"
        self.cfg_file_backup = self.path + "prometheus.yml-backup"

    @staticmethod
    def parse_job(job_data: str, variables: dict) -> dict:
        """
        Renders a Jinja2 template and loads the result as yaml

        :param job_data: text of the Jinja2 template
        :param variables: values used to render the template. Can be None
        :return: the content obtained from loading the rendered text as yaml
        :raises LcmException: if the template cannot be rendered or the rendered text is not valid yaml
        """
        try:
            template = Template(job_data)
            job_parsed = template.render(variables or {})
            return yaml.safe_load(job_parsed)
        except (TemplateError, TemplateNotFound, TemplateSyntaxError) as e:
            raise LcmException("Error parsing Jinja2 to prometheus job. job_data={}, variables={}. Error={}".format(
                job_data, variables, e))
        except yaml.YAMLError as e:
            raise LcmException("Error loading yaml of prometheus job. job_data={}, variables={}. Error={}".format(
                job_data, variables, e))

    async def start(self):
        """
        Creates the prometheus document at database if it does not exist and sends the stored configuration
        to prometheus. Database errors are retried up to 4 times, waiting 5 seconds between attempts.

        :return: None
        :raises LcmException: when the database keeps failing after the last retry
        """
        for retry in range(4):
            try:
                # read from database
                prometheus_data = self.db.get_one("admin", {"_id": "prometheus"}, fail_on_empty=False)
                if not prometheus_data:
                    self.logger.info("Init db.admin.prometheus content")
                    self.db.create("admin", initial_prometheus_data)
                # send database config file to prometheus. Ignore loading errors, as prometheus may be starting
                # but at least an initial configuration file is set
                await self.update()
                return
            except DbException as e:
                if retry == 3:
                    raise LcmException("Max retries trying to init prometheus configuration: {}".format(e)) from e
                # 'loop' argument is not passed: it was removed from asyncio.sleep at python 3.10
                await asyncio.sleep(5)

    async def update(self, add_jobs: dict = None, remove_jobs: list = None) -> bool:
        """
        Locks the prometheus document at database, applies the requested changes, sends the resulting
        configuration to prometheus and, if accepted, stores the changes at database while unlocking.

        :param add_jobs: dictionary with {job_id_1: job_content, job_id_2: job_content}
        :param remove_jobs: list with jobs to remove [job_id_1, job_id_2]. Removal is applied after addition,
            so a job_id present at both ends up removed
        :return: result. If false prometheus denies this configuration. Exception on error
        :raises LcmException: when database cannot be locked or unlocked after 4 attempts
        """
        for retry in range(4):
            result = True
            if retry:  # first time do not wait
                # 'loop' argument is not passed: it was removed from asyncio.sleep at python 3.10
                await asyncio.sleep(self.PROMETHEUS_LOCKED_TIME / 2)

            # lock database. It only matches if unlocked or if the current lock is older than the allowed time
            now = time()
            if not self.db.set_one(
                    "admin",
                    q_filter={"_id": "prometheus", "_admin.locked_at.lt": now - self.PROMETHEUS_LOCKED_TIME},
                    update_dict={"_admin.locked_at": now, "_admin.locked_by": self.worker_id},
                    fail_on_empty=False):
                continue
            # read database
            prometheus_data = self.db.get_one("admin", {"_id": "prometheus"})
            update_dict = {"_admin.locked_at": 0,
                           "_admin.locked_by": None}

            # Make changes from prometheus_incremental
            push_dict = pull_dict = None
            if add_jobs or remove_jobs:
                log_text_list = []
                if add_jobs:
                    log_text_list.append("adding jobs: {}".format(list(add_jobs.keys())))
                    prometheus_data["scrape_configs"].update(add_jobs)
                    push_dict = {"scrape_configs." + job_id: job_data for job_id, job_data in add_jobs.items()}
                # not an 'elif': when both parameters are provided, both must be applied
                if remove_jobs:
                    log_text_list.append("removing jobs: {}".format(list(remove_jobs)))
                    for job_id in remove_jobs:
                        prometheus_data["scrape_configs"].pop(job_id, None)
                        if push_dict:
                            # do not set and unset the same key at the same database operation
                            push_dict.pop("scrape_configs." + job_id, None)
                    pull_dict = {"scrape_configs." + job_id: None for job_id in remove_jobs}
                self.logger.debug(". ".join(log_text_list))

            if not await self.send_data(prometheus_data):
                # configuration not accepted: changes are not stored at database, only the lock is released
                push_dict = pull_dict = None
                result = False

            # unblock database
            if push_dict:
                update_dict.update(push_dict)
            if push_dict or pull_dict:
                # NOTE(review): initial_prometheus_data uses "_admin.modified" and here "_admin.modified_at"
                # is written. Confirm which one is the intended key
                update_dict["_admin.modified_at"] = now
            # it only matches if the lock is still owned by this worker with the same timestamp
            if not self.db.set_one(
                    "admin", {"_id": "prometheus", "_admin.locked_at": now, "_admin.locked_by": self.worker_id},
                    update_dict=update_dict, unset=pull_dict, fail_on_empty=False):
                continue
            return result
        raise LcmException("Cannot update prometheus database. Reached max retries")

    async def send_data(self, new_config):
        """
        Writes the configuration file, asks prometheus to reload it and checks that it has been loaded.
        The previous file, if any, is kept as a backup and restored when the new one is not accepted.

        :param new_config: content of the prometheus database document. It is modified in place: "_id" and
            "_admin" are removed and "scrape_configs" is converted from dictionary to list
        :return: True if prometheus is running the new configuration; False otherwise. Errors are logged,
            not raised
        """
        restore_backup = False
        # database metadata does not belong to the prometheus configuration file
        new_config.pop("_id", None)
        new_config.pop("_admin", None)
        new_scrape_configs = []

        # generate a list with the values of scrape_configs
        for scrape_config in new_config["scrape_configs"].values():
            scrape_config = scrape_config.copy()
            # remove nsr_id metadata from scrape_configs
            scrape_config.pop("nsr_id", None)
            new_scrape_configs.append(scrape_config)
        new_config["scrape_configs"] = new_scrape_configs

        try:
            if os.path.exists(self.cfg_file):
                os.rename(self.cfg_file, self.cfg_file_backup)
                restore_backup = True
            with open(self.cfg_file, "w+") as f:
                yaml.safe_dump(new_config, f, indent=4, default_flow_style=False)
            async with aiohttp.ClientSession() as session:
                async with session.post(self.server + "-/reload") as resp:
                    if resp.status > 204:
                        raise LcmException(await resp.text())
                # 'loop' argument is not passed: it was removed from asyncio.sleep at python 3.10
                await asyncio.sleep(5)
                # If prometheus does not admit this configuration, remains with the old one
                # Then, to check if the configuration has been accepted, get the configuration from prometheus
                # and compares with the inserted one
                async with session.get(self.server + "api/v1/status/config") as resp:
                    if resp.status > 204:
                        raise LcmException(await resp.text())
                    current_config = await resp.json()
                    if not self._check_configuration_equal(current_config, new_config):
                        return False
                    else:
                        restore_backup = False
            return True
        except Exception as e:
            self.logger.error("Error updating prometheus configuration url={}: {}".format(self.server, e))
            return False
        finally:
            # runs also for the 'return False' above: the backup is put back unless the new file was accepted
            if restore_backup:
                try:
                    os.rename(self.cfg_file_backup, self.cfg_file)
                except Exception as e:
                    self.logger.critical("Exception while rolling back: {}".format(e))

    def _check_configuration_equal(self, current_config, expected_config):
        """
        Compares the job names, in order, of the configuration reported by prometheus with the expected ones

        :param current_config: json obtained from prometheus. The yaml text is read from ['data']['yaml']
        :param expected_config: configuration sent, with "scrape_configs" as a list of jobs
        :return: True if both have the same list of job names, or if the obtained data cannot be processed;
            False otherwise
        """
        try:
            current_config_yaml = yaml.safe_load(current_config['data']['yaml'])
            current_jobs = [j["job_name"] for j in current_config_yaml["scrape_configs"]]
            expected_jobs = [j["job_name"] for j in expected_config["scrape_configs"]]
            return current_jobs == expected_jobs
        except Exception as e:
            self.logger.error("Invalid obtained prometheus status. Error: '{}'. Obtained data: '{}'".format(
                e, current_config))
            # if format is not understood, cannot be compared, assume it is ok
            return True