matrix-alertbot/matrix_alertbot/webhook.py

249 lines
8.8 KiB
Python
Raw Normal View History

2022-07-08 22:37:09 +02:00
from __future__ import annotations
import logging
2024-01-22 11:35:13 +01:00
from typing import List
import prometheus_client
from aiohttp import ClientError, web, web_request
from aiohttp.abc import AbstractAccessLogger
2022-07-09 12:43:18 +02:00
from aiohttp_prometheus_exporter.handler import metrics
from aiohttp_prometheus_exporter.middleware import prometheus_middleware_factory
2022-07-06 00:54:13 +02:00
from diskcache import Cache
2024-01-22 11:35:13 +01:00
from nio.exceptions import LocalProtocolError, SendRetryError
2022-07-28 17:39:47 +02:00
from matrix_alertbot.alert import Alert, AlertRenderer
2022-08-08 00:28:36 +02:00
from matrix_alertbot.alertmanager import AlertmanagerClient
from matrix_alertbot.chat_functions import send_text_to_room
from matrix_alertbot.config import Config
2022-08-08 00:28:36 +02:00
from matrix_alertbot.errors import (
AlertmanagerError,
2024-01-22 11:35:13 +01:00
MatrixClientError,
2022-08-08 00:28:36 +02:00
SilenceExtendError,
SilenceNotFoundError,
)
2024-01-22 11:35:13 +01:00
from matrix_alertbot.matrix import MatrixClientPool
# Module-level logger named after this module; also handed to the aiohttp
# application created in Webhook.__init__.
logger = logging.getLogger(__name__)
# Route table that collects the handlers decorated with @routes.get / @routes.post
# in this module; it is registered on the application in Webhook.__init__.
routes = web.RouteTableDef()
class AccessLogger(AbstractAccessLogger):
    """Access logger that emits one debug-level line per handled request."""

    def log(
        self,
        request: web_request.BaseRequest,
        response: web.StreamResponse,
        time: float,
    ) -> None:
        """Write an access-log entry for ``request`` / ``response``.

        The entry contains the remote address, the request line, the response
        status and body length, and the Referer and User-Agent headers. Any
        piece of request information that is unavailable is logged as ``-``.
        The ``time`` argument is accepted but not included in the entry.
        """
        missing = "-"
        # Start from placeholders and overwrite whatever the request provides.
        client = request_line = referer = user_agent = missing

        if request is not None:
            if request.remote is not None:
                client = request.remote
            request_line = (
                f"{request.method} {request.path_qs} "
                f"HTTP/{request.version.major}.{request.version.minor}"
            )
            referer = request.headers.get("Referer", missing)
            user_agent = request.headers.get("User-Agent", missing)

        self.logger.debug(
            f'{client} "{request_line}" {response.status} '
            f'{response.body_length} "{referer}" "{user_agent}"'
        )
2022-07-12 18:19:52 +02:00
@routes.get("/health")
async def get_health(request: web_request.Request) -> web.Response:
    """Health-check endpoint: always reply with an empty HTTP 200 response."""
    ok_response = web.Response(status=200)
    return ok_response
@routes.post("/alerts/{room_id}")
async def create_alerts(request: web_request.Request) -> web.Response:
    """Handle a webhook call carrying alerts destined for a Matrix room.

    The request body must be a JSON object with an ``alerts`` key holding a
    non-empty list of alert dicts. Every alert is parsed first; only when all
    of them parse is each one handed to ``create_alert``.

    Returns:
        * 200 when every alert was handled.
        * 400 when the body is not valid JSON, is not an object, has no
          ``alerts`` key, ``alerts`` is not a list or is empty, or an alert
          cannot be parsed.
        * 401 when ``room_id`` is not in ``config.allowed_rooms``.
        * 500 when handling an alert fails; processing stops at the first
          failing alert.
    """
    room_id = request.match_info["room_id"]

    try:
        data = await request.json()
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; a malformed body is a client error, not a server error.
        logger.error(f"Received invalid JSON data for room ID {room_id}: {e}")
        return web.Response(status=400, body="Data must be valid JSON.")

    config: Config = request.app["config"]

    if room_id not in config.allowed_rooms:
        logger.error(f"Cannot send alerts to room ID {room_id}.")
        return web.Response(
            status=401, body=f"Cannot send alerts to room ID {room_id}."
        )

    # A JSON array or scalar cannot carry an 'alerts' key; reject it here so
    # the subscript below never fails on a non-mapping payload.
    if not isinstance(data, dict) or "alerts" not in data:
        logger.error("Received data without 'alerts' key")
        return web.Response(status=400, body="Data must contain 'alerts' key.")

    alert_dicts = data["alerts"]

    if not isinstance(alert_dicts, list):
        alerts_type = alert_dicts.__class__.__name__
        logger.error(f"Received data with invalid alerts type '{alerts_type}'.")
        return web.Response(
            status=400, body=f"Alerts must be a list, got '{alerts_type}'."
        )

    logger.info(f"Received {len(alert_dicts)} alerts for room ID {room_id}: {data}")

    if not alert_dicts:
        return web.Response(status=400, body="Alerts cannot be empty.")

    # Parse everything up front so that one malformed alert rejects the whole
    # request before anything is sent.
    alerts: List[Alert] = []
    for alert_dict in alert_dicts:
        try:
            alert = Alert.from_dict(alert_dict)
        except (KeyError, TypeError) as e:
            # KeyError: a required field is missing.
            # TypeError: NOTE(review) assumes Alert.from_dict subscripts the
            # entry by key, which fails this way for non-mapping entries
            # (e.g. a string or a number) -- confirm against Alert.from_dict.
            logger.error(f"Cannot parse alert dict: {e}")
            return web.Response(status=400, body=f"Invalid alert: {alert_dict}.")
        alerts.append(alert)

    for alert in alerts:
        try:
            await create_alert(alert, room_id, request)
        except AlertmanagerError as e:
            logger.error(
                f"An error occured with Alertmanager when handling alert with fingerprint {alert.fingerprint}: {e}"
            )
            return web.Response(
                status=500,
                body=f"An error occured with Alertmanager when handling alert with fingerprint {alert.fingerprint}.",
            )
        except (
            SendRetryError,
            LocalProtocolError,
            ClientError,
            MatrixClientError,
        ) as e:
            # Transport-level and Matrix-client failures are reported to the
            # caller in exactly the same way.
            logger.error(
                f"Unable to send alert {alert.fingerprint} to Matrix room {room_id}: {e}"
            )
            return web.Response(
                status=500,
                body=f"An error occured when sending alert with fingerprint '{alert.fingerprint}' to Matrix room.",
            )
        except Exception as e:
            # Last-resort boundary: keep the traceback, since the cause of an
            # unexpected exception is not known in advance.
            logger.error(
                f"Unable to send alert {alert.fingerprint} to Matrix room {room_id}: {e}",
                exc_info=True,
            )
            return web.Response(
                status=500,
                body=f"An exception occured when sending alert with fingerprint '{alert.fingerprint}' to Matrix room.",
            )

    return web.Response(status=200)
async def create_alert(
    alert: Alert, room_id: str, request: web_request.Request
) -> None:
    """Process one alert: extend its silence or send it to a Matrix room.

    Steps, in order:

    1. If the alert carries the configured DM selection label and matches
       all DM filter labels, the target room becomes the DM room of the
       mapped user. If the user or their DM room is unknown, a warning is
       logged and nothing is sent.
    2. For a firing alert, try to extend its silence in Alertmanager. On
       success nothing is sent. If no silence is found, the cache entry
       stored under the alert fingerprint is deleted and processing
       continues.
    3. Render the alert as plain text and HTML and send it to the room.
    4. For a firing alert, store the fingerprint in the cache under the sent
       event's ID; otherwise delete the cache entry stored under the
       fingerprint.

    Raises:
        MatrixClientError: if the client pool has no Matrix client.
    """
    app = request.app
    alertmanager_client: AlertmanagerClient = app["alertmanager_client"]
    alert_renderer: AlertRenderer = app["alert_renderer"]
    matrix_client_pool: MatrixClientPool = app["matrix_client_pool"]
    cache: Cache = app["cache"]
    config: Config = app["config"]

    # Step 1: optionally reroute the alert to a user's direct-message room.
    route_to_dm = (
        config.dm_select_label
        and config.dm_select_label in alert.labels
        and alert.match_all_labels(config.dm_filter_labels)
    )
    if route_to_dm:
        dm_select_value = alert.labels[config.dm_select_label]
        if dm_select_value not in config.dm_users:
            logger.warning(
                f"Cannot find user with label {config.dm_select_label}={dm_select_value}"
            )
            return

        user_id = config.dm_users[dm_select_value]
        if user_id not in matrix_client_pool.dm_rooms:
            logger.warning(f"Cannot find a matrix room for user {user_id}")
            return

        room_id = matrix_client_pool.dm_rooms[user_id]

    # Step 2: a firing alert whose silence can be extended is not re-sent.
    if alert.firing:
        try:
            silence_id = await alertmanager_client.update_silence(alert.fingerprint)
        except SilenceNotFoundError as e:
            logger.debug(
                f"Unable to extend silence for alert with fingerprint {alert.fingerprint}: {e}"
            )
            cache.delete(alert.fingerprint)
        except SilenceExtendError as e:
            logger.debug(
                f"Unable to extend silence for alert with fingerprint {alert.fingerprint}: {e}"
            )
        else:
            logger.debug(
                f"Extended silence ID {silence_id} for alert with fingerprint {alert.fingerprint}"
            )
            return

    # Step 3: render both representations, then send.
    plaintext = alert_renderer.render(alert, html=False)
    html = alert_renderer.render(alert, html=True)

    if matrix_client_pool.matrix_client is None:
        raise MatrixClientError("No matrix client available")

    event = await send_text_to_room(
        matrix_client_pool.matrix_client, room_id, plaintext, html, notice=False
    )

    # Step 4: keep the cache in sync with the alert state.
    if not alert.firing:
        cache.delete(alert.fingerprint)
        return

    cache.set(event.event_id, alert.fingerprint, expire=config.cache_expire_time)
2022-07-08 22:46:04 +02:00
class Webhook:
    """aiohttp application wrapper exposing the alert webhook endpoints.

    The application serves the handlers registered on the module-level
    ``routes`` table plus a ``/metrics`` endpoint, and can listen either on a
    TCP address/port or on a unix socket.
    """

    def __init__(
        self,
        matrix_client_pool: MatrixClientPool,
        alertmanager_client: AlertmanagerClient,
        cache: Cache,
        config: Config,
    ) -> None:
        """Build the aiohttp application and its runner.

        Args:
            matrix_client_pool: stored on the app under ``matrix_client_pool``.
            alertmanager_client: stored on the app under ``alertmanager_client``.
            cache: stored on the app under ``cache``.
            config: stored on the app under ``config``; also supplies the
                template directory and the listening address/port/socket.
        """
        self.app = web.Application(logger=logger)

        # Shared objects that request handlers read back from request.app.
        self.app["matrix_client_pool"] = matrix_client_pool
        self.app["alertmanager_client"] = alertmanager_client
        self.app["config"] = config
        self.app["cache"] = cache
        self.app["alert_renderer"] = AlertRenderer(config.template_dir)

        self.app.add_routes(routes)

        # Request metrics are collected into a dedicated registry by the
        # middleware; the /metrics route is added alongside it.
        prometheus_registry = prometheus_client.CollectorRegistry(auto_describe=True)
        self.app.middlewares.append(
            prometheus_middleware_factory(registry=prometheus_registry)
        )
        self.app.router.add_get("/metrics", metrics())

        self.runner = web.AppRunner(self.app, access_log_class=AccessLogger)

        self.config = config
        self.address = config.address
        self.port = config.port
        self.socket = config.socket

    async def start(self) -> None:
        """Set up the runner and start listening.

        A TCP site is used when both ``address`` and ``port`` are set;
        otherwise a unix-socket site is used when ``socket`` is set.

        Raises:
            RuntimeError: if neither an address/port pair nor a socket is
                configured.
        """
        use_tcp = bool(self.address and self.port)

        # Validate before touching the runner: without this check the method
        # would fail later with an UnboundLocalError on `site`, after the
        # runner had already been set up.
        if not use_tcp and not self.socket:
            raise RuntimeError(
                "Cannot start webhook: neither address and port nor socket is configured."
            )

        await self.runner.setup()

        site: web.BaseSite
        if use_tcp:
            site = web.TCPSite(self.runner, self.address, self.port)
            logger.info(f"Listening on {self.address}:{self.port}")
        else:
            site = web.UnixSite(self.runner, self.socket)
            logger.info(f"Listening on unix://{self.socket}")

        await site.start()

    async def close(self) -> None:
        """Clean up the runner."""
        await self.runner.cleanup()