Files
ETB/ETB-API/monitoring/services/alerting.py
Iliyan Angelov 6b247e5b9f Updates
2025-09-19 11:58:53 +03:00

450 lines
17 KiB
Python

"""
Alerting service for monitoring system
"""
import json
import logging
import urllib.request
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from urllib.parse import urlsplit

from django.utils import timezone
from django.core.mail import send_mail
from django.conf import settings
from django.contrib.auth import get_user_model

from monitoring.models import AlertRule, Alert, SystemMetric, MetricMeasurement, MonitoringTarget
# User model class returned by Django's get_user_model(); used in this module
# as the type annotation for the user who acknowledges or resolves an alert.
User = get_user_model()
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class AlertEvaluator:
    """Evaluate alert rules and record an ``Alert`` for every rule that fires.

    A rule's ``condition`` is a dict whose ``type`` key selects the check:
    ``THRESHOLD``, ``ANOMALY``, ``AVAILABILITY`` or ``PATTERN``.
    """

    # Comparison callables keyed by the ``operator`` string of a THRESHOLD condition.
    _COMPARATORS: Dict[str, Any] = {
        '>': lambda current, threshold: current > threshold,
        '>=': lambda current, threshold: current >= threshold,
        '<': lambda current, threshold: current < threshold,
        '<=': lambda current, threshold: current <= threshold,
        '==': lambda current, threshold: current == threshold,
        '!=': lambda current, threshold: current != threshold,
    }

    # Parameters of the statistical anomaly check.
    _ANOMALY_WINDOW = timedelta(hours=24)
    _ANOMALY_MAX_SAMPLES = 100
    _ANOMALY_MIN_SAMPLES = 10
    _DEFAULT_ANOMALY_SIGMA = 2.0

    def __init__(self):
        self.aggregator = None  # Will be imported to avoid circular imports

    def evaluate_alert_rules(self) -> List[Dict[str, Any]]:
        """Evaluate every active, enabled rule and create alerts for those that fire.

        Returns:
            One payload dict (see ``_build_alert_payload``) per created alert.
        """
        triggered_alerts = []
        active_rules = AlertRule.objects.filter(status='ACTIVE', is_enabled=True)
        for rule in active_rules:
            try:
                if self._evaluate_rule(rule):
                    triggered_alerts.append(self._create_alert(rule))
            except Exception as e:
                # A single broken rule must not stop the remaining rules from
                # being evaluated; keep the traceback for diagnosis.
                logger.exception("Failed to evaluate alert rule %s: %s", rule.name, e)
        return triggered_alerts

    def _evaluate_rule(self, rule: "AlertRule") -> bool:
        """Return True when the rule's condition is currently met.

        Unknown condition types, and conditions that are missing or not a
        dict, are logged and treated as "not met".
        """
        condition = rule.condition or {}
        if not isinstance(condition, dict):
            logger.warning("Unknown condition type: %s", None)
            return False

        handlers = {
            'THRESHOLD': self._evaluate_threshold_condition,
            'ANOMALY': self._evaluate_anomaly_condition,
            'AVAILABILITY': self._evaluate_availability_condition,
            'PATTERN': self._evaluate_pattern_condition,
        }
        condition_type = condition.get('type')
        handler = handlers.get(condition_type)
        if handler is None:
            logger.warning("Unknown condition type: %s", condition_type)
            return False
        return handler(rule, condition)

    @staticmethod
    def _coerce_threshold(raw_threshold: Any) -> Optional[float]:
        """Convert a configured threshold to ``float``; None if absent or not numeric."""
        if raw_threshold is None:
            return None
        try:
            return float(raw_threshold)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _latest_value(metric: Any) -> Optional[float]:
        """Return the most recent measurement of ``metric`` as float, or None if there is none."""
        latest_measurement = MetricMeasurement.objects.filter(
            metric=metric
        ).order_by('-timestamp').first()
        if latest_measurement is None:
            return None
        return float(latest_measurement.value)

    def _evaluate_threshold_condition(self, rule: "AlertRule", condition: Dict[str, Any]) -> bool:
        """Compare the rule metric's latest value against ``condition['threshold']``.

        ``condition['operator']`` defaults to ``'>'``. Returns False when the
        rule has no metric, the threshold is missing or non-numeric, the
        operator is unknown, or no measurement exists yet.
        """
        if not rule.metric:
            return False

        threshold_value = self._coerce_threshold(condition.get('threshold'))
        if threshold_value is None:
            # Comparing a float with None would raise TypeError; treat a
            # misconfigured rule as "not met" and say why.
            logger.warning(
                "Alert rule %s has no usable threshold: %r",
                rule.name, condition.get('threshold'),
            )
            return False

        operator = condition.get('operator', '>')
        compare = self._COMPARATORS.get(operator)
        if compare is None:
            logger.warning("Unknown operator: %s", operator)
            return False

        current_value = self._latest_value(rule.metric)
        if current_value is None:
            return False
        return compare(current_value, threshold_value)

    @classmethod
    def _is_anomalous(cls, values: List[float], sigma: float) -> bool:
        """Return True when ``values[0]`` deviates from the sample mean by more than ``sigma`` standard deviations.

        ``values`` is ordered newest first. The newest value is itself part of
        the sample used for the mean and the (population) standard deviation.
        Fewer than ``_ANOMALY_MIN_SAMPLES`` values always yields False.
        """
        if len(values) < cls._ANOMALY_MIN_SAMPLES:
            return False
        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        std_dev = variance ** 0.5
        return abs(values[0] - mean) > (sigma * std_dev)

    def _evaluate_anomaly_condition(self, rule: "AlertRule", condition: Dict[str, Any]) -> bool:
        """Evaluate anomaly-based alert conditions.

        Uses up to the last 100 measurements from the past 24 hours and a
        simple mean/standard-deviation test. ``condition['threshold']`` is the
        number of standard deviations (default 2.0).
        """
        # This would integrate with anomaly detection models
        # For now, implement a simple statistical anomaly detection
        if not rule.metric:
            return False

        since = timezone.now() - self._ANOMALY_WINDOW
        measurements = MetricMeasurement.objects.filter(
            metric=rule.metric,
            timestamp__gte=since,
        ).order_by('-timestamp')[:self._ANOMALY_MAX_SAMPLES]
        values = [float(m.value) for m in measurements]

        # An explicit None or non-numeric threshold falls back to the default
        # instead of raising TypeError in the multiplication.
        sigma = self._coerce_threshold(condition.get('threshold'))
        if sigma is None:
            sigma = self._DEFAULT_ANOMALY_SIGMA
        return self._is_anomalous(values, sigma)

    def _evaluate_availability_condition(self, rule: "AlertRule", condition: Dict[str, Any]) -> bool:
        """Return True when the rule's target reports ``last_status == 'CRITICAL'``."""
        if not rule.target:
            return False
        return rule.target.last_status == 'CRITICAL'

    def _evaluate_pattern_condition(self, rule: "AlertRule", condition: Dict[str, Any]) -> bool:
        """Evaluate pattern-based alert conditions (placeholder, never fires)."""
        # This would integrate with pattern detection algorithms
        # For now, return False as placeholder
        return False

    @staticmethod
    def _build_alert_payload(
        rule: Any,
        alert: Any,
        current_value: Optional[float],
        threshold_value: Any,
    ) -> Dict[str, Any]:
        """Build the dict describing a created alert.

        ``rule_id`` is included because the notification step looks the rule
        up by that key to find its notification channels.
        """
        return {
            'alert_id': str(alert.id),
            'rule_id': str(rule.id),
            'rule_name': rule.name,
            'severity': rule.severity,
            'title': alert.title,
            'description': alert.description,
            'current_value': current_value,
            'threshold_value': threshold_value,
        }

    def _create_alert(self, rule: "AlertRule") -> Dict[str, Any]:
        """Persist an ``Alert`` for ``rule`` and return its payload dict."""
        # Get current value for context
        current_value = None
        threshold_value = None
        if rule.metric:
            current_value = self._latest_value(rule.metric)
            if current_value is not None:
                # NOTE(review): this records the metric's critical threshold,
                # not the threshold configured in the rule's condition.
                threshold_value = rule.metric.critical_threshold

        alert = Alert.objects.create(
            rule=rule,
            title=f"{rule.name} - {rule.severity}",
            description=self._generate_alert_description(rule, current_value, threshold_value),
            severity=rule.severity,
            triggered_value=current_value,
            threshold_value=threshold_value,
            context_data={
                'rule_id': str(rule.id),
                'metric_name': rule.metric.name if rule.metric else None,
                'target_name': rule.target.name if rule.target else None,
                'condition': rule.condition,
            },
        )
        return self._build_alert_payload(rule, alert, current_value, threshold_value)

    def _generate_alert_description(self, rule: "AlertRule", current_value: Optional[float], threshold_value: Optional[float]) -> str:
        """Build the multi-line, human-readable description stored on the alert."""
        lines = [f"Alert rule '{rule.name}' has been triggered."]
        if rule.metric and current_value is not None:
            lines.append(f"Current value: {current_value} {rule.metric.unit}")
        if threshold_value is not None:
            lines.append(f"Threshold: {threshold_value} {rule.metric.unit if rule.metric else ''}")
        if rule.target:
            lines.append(f"Target: {rule.target.name}")
        lines.append(f"Severity: {rule.severity}")
        lines.append(f"Time: {timezone.now().strftime('%Y-%m-%d %H:%M:%S')}")
        return "\n".join(lines)
class NotificationService:
    """Deliver alert notifications over the channels configured on an alert rule.

    Supported channel types are ``EMAIL``, ``SLACK`` and ``WEBHOOK``. Each
    sender returns ``{'status': 'sent', ...}`` on success or
    ``{'error': <message>}`` on failure; senders do not raise.
    """

    # Slack attachment colour per alert severity.
    _SLACK_COLORS = {
        'LOW': 'good',
        'MEDIUM': 'warning',
        'HIGH': 'danger',
        'CRITICAL': 'danger',
    }
    # Upper bound for a single outgoing HTTP call, so that one unresponsive
    # endpoint cannot stall the whole evaluation run.
    _HTTP_TIMEOUT_SECONDS = 10

    def __init__(self):
        self.evaluator = AlertEvaluator()

    def send_alert_notifications(self, alert_data: Dict[str, Any]) -> Dict[str, Any]:
        """Send notifications for an alert over every channel of its rule.

        The rule is looked up by ``alert_data['rule_id']``; when that key is
        absent, it is resolved through ``alert_data['alert_id']`` instead.

        Returns:
            ``{'error': ...}`` when the rule cannot be determined, otherwise a
            dict mapping each channel type to that channel's result dict.
            NOTE: several channels of the same type share one key, so only
            the last result per type is kept.
        """
        rule_id = alert_data.get('rule_id')
        alert_id = alert_data.get('alert_id')
        if rule_id:
            try:
                rule = AlertRule.objects.get(id=rule_id)
            except AlertRule.DoesNotExist:
                return {'error': 'Alert rule not found'}
        elif alert_id:
            # Fall back to the alert itself for payloads that carry only an
            # alert ID.
            try:
                rule = Alert.objects.select_related('rule').get(id=alert_id).rule
            except Alert.DoesNotExist:
                return {'error': 'Alert rule not found'}
            if rule is None:
                return {'error': 'Alert rule not found'}
        else:
            return {'error': 'No rule ID provided'}

        senders = {
            'EMAIL': self._send_email_notification,
            'SLACK': self._send_slack_notification,
            'WEBHOOK': self._send_webhook_notification,
        }
        results = {}
        for channel in rule.notification_channels or []:
            # Read the type once, outside the try block: looking it up again
            # inside the except handler would raise from the handler itself
            # when the key is missing.
            channel_type = channel.get('type') if isinstance(channel, dict) else None
            result_key = channel_type or 'UNKNOWN'
            sender = senders.get(channel_type)
            if sender is None:
                results[result_key] = {'error': f'Unknown notification channel type: {channel_type}'}
                continue
            try:
                results[result_key] = sender(alert_data, channel)
            except Exception as e:
                logger.error("Failed to send %s notification: %s", channel_type, e)
                results[result_key] = {'error': str(e)}
        return results

    def _send_email_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send the alert by email to ``channel['recipients']``."""
        try:
            recipients = channel.get('recipients', [])
            if isinstance(recipients, str):
                # Accept a single address given as a plain string.
                recipients = [recipients]
            if not recipients:
                return {'error': 'No email recipients configured'}
            recipients = list(recipients)

            subject = f"[{alert_data.get('severity', 'ALERT')}] {alert_data.get('title', 'System Alert')}"
            send_mail(
                subject=subject,
                message=alert_data.get('description', ''),
                from_email=settings.DEFAULT_FROM_EMAIL,
                recipient_list=recipients,
                fail_silently=False,
            )
            return {'status': 'sent', 'recipients': recipients}
        except Exception as e:
            return {'error': str(e)}

    def _send_slack_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Post the alert to the Slack incoming webhook at ``channel['webhook_url']``."""
        webhook_url = channel.get('webhook_url')
        if not webhook_url:
            return {'error': 'No Slack webhook URL configured'}
        try:
            slack_message = {
                "text": alert_data.get('title', 'System Alert'),
                "attachments": [
                    {
                        "color": self._get_slack_color(alert_data.get('severity', 'MEDIUM')),
                        "fields": [
                            {
                                "title": "Description",
                                "value": alert_data.get('description', ''),
                                "short": False,
                            },
                            {
                                "title": "Severity",
                                "value": alert_data.get('severity', 'UNKNOWN'),
                                "short": True,
                            },
                            {
                                "title": "Time",
                                "value": timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                                "short": True,
                            },
                        ],
                    }
                ],
            }
            # Actually deliver the message; 'sent' is only reported once the
            # HTTP request has completed without error.
            self._post_json(webhook_url, slack_message)
            return {'status': 'sent', 'channel': channel.get('channel', '#alerts')}
        except Exception as e:
            return {'error': str(e)}

    def _send_webhook_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """POST the alert as JSON to the generic webhook at ``channel['url']``."""
        webhook_url = channel.get('url')
        if not webhook_url:
            return {'error': 'No webhook URL configured'}
        try:
            payload = {
                'alert': alert_data,
                'timestamp': timezone.now().isoformat(),
                'source': 'ETB-API-Monitoring',
            }
            self._post_json(webhook_url, payload)
            return {'status': 'sent', 'url': webhook_url}
        except Exception as e:
            return {'error': str(e)}

    @staticmethod
    def _is_http_url(url: Any) -> bool:
        """Return True only for string URLs whose scheme is http or https."""
        return isinstance(url, str) and urlsplit(url).scheme.lower() in ('http', 'https')

    def _post_json(self, url: str, payload: Dict[str, Any]) -> None:
        """POST ``payload`` as JSON to ``url``.

        Raises:
            ValueError: if ``url`` is not an http(s) URL.
            urllib.error.URLError: on connection failure or an HTTP error status.
        """
        # urlopen also understands schemes such as file://, so restrict the
        # configured URL to HTTP(S) before using it.
        if not self._is_http_url(url):
            raise ValueError(f'Unsupported webhook URL: {url!r}')
        # default=str is deliberate: alert payloads may carry values that the
        # json module cannot encode natively (e.g. Decimal thresholds).
        body = json.dumps(payload, default=str).encode('utf-8')
        request = urllib.request.Request(
            url,
            data=body,
            headers={'Content-Type': 'application/json'},
            method='POST',
        )
        with urllib.request.urlopen(request, timeout=self._HTTP_TIMEOUT_SECONDS) as response:
            response.read()

    def _get_slack_color(self, severity: str) -> str:
        """Return the Slack attachment colour for ``severity`` ('warning' if unknown)."""
        return self._SLACK_COLORS.get(severity, 'warning')
class AlertingService:
    """Main alerting service that coordinates alert evaluation and notification"""

    def __init__(self):
        self.evaluator = AlertEvaluator()
        self.notification_service = NotificationService()

    @staticmethod
    def _any_delivered(notification_results: Dict[str, Any]) -> bool:
        """Return True when at least one channel result reports ``status == 'sent'``."""
        return any(
            isinstance(outcome, dict) and outcome.get('status') == 'sent'
            for outcome in notification_results.values()
        )

    @staticmethod
    def _notification_errors(notification_results: Dict[str, Any]) -> List[str]:
        """Collect error messages from a ``send_alert_notifications`` result.

        Handles both result shapes: a top-level ``{'error': <str>}`` and the
        per-channel ``{<type>: {'error': <str>}}`` form.
        """
        errors = []
        for key, outcome in notification_results.items():
            if key == 'error' and isinstance(outcome, str):
                errors.append(outcome)
            elif isinstance(outcome, dict) and 'error' in outcome:
                errors.append(f"{key}: {outcome['error']}")
        return errors

    @staticmethod
    def _optional_float(value: Any) -> Optional[float]:
        """Convert ``value`` to float, keeping None as None (0 stays 0.0)."""
        return None if value is None else float(value)

    def run_alert_evaluation(self) -> Dict[str, Any]:
        """Run alert evaluation and send notifications.

        Returns:
            Dict with ``evaluated_rules``, ``triggered_alerts``,
            ``notifications_sent`` (number of alerts for which at least one
            channel delivered) and ``errors`` (list of messages).
        """
        results = {
            'evaluated_rules': 0,
            'triggered_alerts': 0,
            'notifications_sent': 0,
            'errors': [],
        }
        try:
            triggered_alerts = self.evaluator.evaluate_alert_rules()
            results['triggered_alerts'] = len(triggered_alerts)

            for alert_data in triggered_alerts:
                try:
                    notification_results = self.notification_service.send_alert_notifications(alert_data)
                except Exception as e:
                    logger.error(
                        "Failed to send notifications for alert %s: %s",
                        alert_data.get('alert_id'), e,
                    )
                    results['errors'].append(str(e))
                    continue
                # Senders report failure through their return value rather
                # than by raising, so inspect it instead of counting every
                # call as a success.
                if self._any_delivered(notification_results):
                    results['notifications_sent'] += 1
                results['errors'].extend(self._notification_errors(notification_results))

            results['evaluated_rules'] = AlertRule.objects.filter(
                status='ACTIVE',
                is_enabled=True,
            ).count()
        except Exception as e:
            logger.error("Alert evaluation failed: %s", e)
            results['errors'].append(str(e))
        return results

    def _change_alert_state(
        self,
        alert_id: str,
        user: "User",
        status: str,
        actor_field: str,
        time_field: str,
        verb: str,
    ) -> Dict[str, Any]:
        """Set an alert's status plus its "who" and "when" fields, then save it.

        Returns a ``{'status': 'success'|'error', 'message': ...}`` dict; does
        not raise.
        """
        try:
            alert = Alert.objects.get(id=alert_id)
            alert.status = status
            setattr(alert, actor_field, user)
            setattr(alert, time_field, timezone.now())
            alert.save()
            return {
                'status': 'success',
                'message': f'Alert {alert_id} {verb} by {user.username}',
            }
        except Alert.DoesNotExist:
            return {
                'status': 'error',
                'message': f'Alert {alert_id} not found',
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': str(e),
            }

    def acknowledge_alert(self, alert_id: str, user: "User") -> Dict[str, Any]:
        """Mark the alert ``ACKNOWLEDGED`` by ``user`` at the current time."""
        return self._change_alert_state(
            alert_id, user, 'ACKNOWLEDGED', 'acknowledged_by', 'acknowledged_at', 'acknowledged',
        )

    def resolve_alert(self, alert_id: str, user: "User") -> Dict[str, Any]:
        """Mark the alert ``RESOLVED`` by ``user`` at the current time."""
        return self._change_alert_state(
            alert_id, user, 'RESOLVED', 'resolved_by', 'resolved_at', 'resolved',
        )

    def get_active_alerts(self, severity: Optional[str] = None) -> List[Dict[str, Any]]:
        """Return alerts in ``TRIGGERED`` status, newest first.

        Args:
            severity: when given, only alerts with this severity are returned.
        """
        # select_related avoids one extra query per alert for alert.rule.name.
        alerts = Alert.objects.filter(status='TRIGGERED').select_related('rule')
        if severity:
            alerts = alerts.filter(severity=severity)
        return [
            {
                'id': str(alert.id),
                'title': alert.title,
                'description': alert.description,
                'severity': alert.severity,
                'triggered_at': alert.triggered_at,
                'rule_name': alert.rule.name,
                # Explicit None check: a recorded value of 0 is a real value.
                'current_value': self._optional_float(alert.triggered_value),
                'threshold_value': self._optional_float(alert.threshold_value),
            }
            for alert in alerts.order_by('-triggered_at')
        ]