Updates
This commit is contained in:
449
ETB-API/monitoring/services/alerting.py
Normal file
449
ETB-API/monitoring/services/alerting.py
Normal file
@@ -0,0 +1,449 @@
|
||||
"""
|
||||
Alerting service for monitoring system
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from django.utils import timezone
|
||||
from django.core.mail import send_mail
|
||||
from django.conf import settings
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
from monitoring.models import AlertRule, Alert, SystemMetric, MetricMeasurement, MonitoringTarget
|
||||
|
||||
User = get_user_model()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AlertEvaluator:
    """Evaluate alert rules and create ``Alert`` rows for those that fire.

    Each enabled, active ``AlertRule`` carries a ``condition`` mapping whose
    ``type`` key selects the evaluation strategy (``THRESHOLD``, ``ANOMALY``,
    ``AVAILABILITY`` or ``PATTERN``).
    """

    def __init__(self):
        self.aggregator = None  # Will be imported to avoid circular imports

    def evaluate_alert_rules(self) -> List[Dict[str, Any]]:
        """Evaluate every active, enabled rule.

        Returns:
            One summary dict (see ``_create_alert``) per rule whose condition
            was met. A rule that raises during evaluation is logged and
            skipped so one broken rule cannot block the others.
        """
        triggered_alerts = []

        active_rules = AlertRule.objects.filter(
            status='ACTIVE',
            is_enabled=True,
        )

        for rule in active_rules:
            try:
                if self._evaluate_rule(rule):
                    triggered_alerts.append(self._create_alert(rule))
            except Exception as e:
                # Boundary handler: isolate failures per rule.
                logger.error(f"Failed to evaluate alert rule {rule.name}: {e}")

        return triggered_alerts

    def _evaluate_rule(self, rule: AlertRule) -> bool:
        """Dispatch ``rule`` to the evaluator matching its condition type.

        Returns ``False`` (after logging a warning) when the condition type
        is missing or not recognised.
        """
        # A rule stored without a condition is treated as "unknown type"
        # instead of raising AttributeError on ``None.get``.
        condition = rule.condition or {}
        condition_type = condition.get('type')

        handlers = {
            'THRESHOLD': self._evaluate_threshold_condition,
            'ANOMALY': self._evaluate_anomaly_condition,
            'AVAILABILITY': self._evaluate_availability_condition,
            'PATTERN': self._evaluate_pattern_condition,
        }
        # Only look up string types so an unhashable value cannot raise.
        handler = handlers.get(condition_type) if isinstance(condition_type, str) else None
        if handler is None:
            logger.warning(f"Unknown condition type: {condition_type}")
            return False

        return handler(rule, condition)

    @staticmethod
    def _compare(current_value: float, threshold_value: float, op_symbol: Any) -> Optional[bool]:
        """Apply the comparison named by ``op_symbol``.

        Returns:
            The boolean result, or ``None`` when ``op_symbol`` is not one of
            ``>``, ``>=``, ``<``, ``<=``, ``==``, ``!=``.
        """
        import operator  # local import keeps this block self-contained

        comparators = {
            '>': operator.gt,
            '>=': operator.ge,
            '<': operator.lt,
            '<=': operator.le,
            '==': operator.eq,
            '!=': operator.ne,
        }
        comparator = comparators.get(op_symbol) if isinstance(op_symbol, str) else None
        if comparator is None:
            return None
        return comparator(current_value, threshold_value)

    def _evaluate_threshold_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Compare the rule metric's latest measurement with a threshold.

        ``condition['threshold']`` is the limit and ``condition['operator']``
        (default ``'>'``) the comparison. Returns ``False`` when the rule has
        no metric, the metric has no measurements, the threshold is missing
        or non-numeric, or the operator is unknown.
        """
        if not rule.metric:
            return False

        # Get latest metric value
        latest_measurement = MetricMeasurement.objects.filter(
            metric=rule.metric
        ).order_by('-timestamp').first()

        if not latest_measurement:
            return False

        # Coerce the threshold up front: a missing threshold must not fire
        # the rule (``value != None`` is always True) and ordering against
        # None/str would raise TypeError.
        raw_threshold = condition.get('threshold')
        try:
            threshold_value = float(raw_threshold)
        except (TypeError, ValueError):
            logger.warning(f"Invalid threshold for alert rule {rule.name}: {raw_threshold!r}")
            return False

        op_symbol = condition.get('operator', '>')
        outcome = self._compare(float(latest_measurement.value), threshold_value, op_symbol)
        if outcome is None:
            logger.warning(f"Unknown operator: {op_symbol}")
            return False
        return outcome

    @staticmethod
    def _is_anomalous(values: List[float], sigma: float) -> bool:
        """Return True if ``values[0]`` deviates from the mean by more than ``sigma`` std-devs.

        Uses the population standard deviation of ``values`` (which includes
        ``values[0]`` itself). An empty list or a zero spread is never
        anomalous.
        """
        if not values:
            return False

        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        std_dev = variance ** 0.5
        if std_dev == 0:
            # All samples identical: nothing can be an outlier.
            return False

        return abs(values[0] - mean) > (sigma * std_dev)

    def _evaluate_anomaly_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Flag the latest measurement if it is a statistical outlier.

        Looks at up to the 100 most recent measurements from the last 24
        hours and requires at least 10 of them. ``condition['threshold']`` is
        the number of standard deviations tolerated (default 2.0).
        """
        # This would integrate with anomaly detection models
        # For now, implement a simple statistical anomaly detection
        if not rule.metric:
            return False

        # Get recent measurements, newest first
        since = timezone.now() - timedelta(hours=24)
        measurements = MetricMeasurement.objects.filter(
            metric=rule.metric,
            timestamp__gte=since,
        ).order_by('-timestamp')[:100]  # Last 100 measurements

        values = [float(m.value) for m in measurements]
        if len(values) < 10:  # Need minimum data points
            return False

        sigma = condition.get('threshold')
        if sigma is None:
            sigma = 2.0  # Default 2 sigma

        return self._is_anomalous(values, sigma)

    def _evaluate_availability_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Return True when the rule's target reports ``last_status == 'CRITICAL'``."""
        if not rule.target:
            return False

        # Check if target is in critical state
        return rule.target.last_status == 'CRITICAL'

    def _evaluate_pattern_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Placeholder for pattern-based conditions; currently never fires."""
        # This would integrate with pattern detection algorithms
        return False

    def _create_alert(self, rule: AlertRule) -> Dict[str, Any]:
        """Persist an ``Alert`` for ``rule`` and return a summary of it.

        Returns:
            Dict with ``alert_id``, ``rule_id``, ``rule_name``, ``severity``,
            ``title``, ``description``, ``current_value`` and
            ``threshold_value``. ``rule_id`` is included because
            ``NotificationService.send_alert_notifications`` uses it to look
            up the rule's notification channels.
        """
        # Get current value for context
        current_value = None
        threshold_value = None

        if rule.metric:
            latest_measurement = MetricMeasurement.objects.filter(
                metric=rule.metric
            ).order_by('-timestamp').first()
            if latest_measurement:
                current_value = float(latest_measurement.value)
                threshold_value = rule.metric.critical_threshold

        rule_id = str(rule.id)

        # Create alert
        alert = Alert.objects.create(
            rule=rule,
            title=f"{rule.name} - {rule.severity}",
            description=self._generate_alert_description(rule, current_value, threshold_value),
            severity=rule.severity,
            triggered_value=current_value,
            threshold_value=threshold_value,
            context_data={
                'rule_id': rule_id,
                'metric_name': rule.metric.name if rule.metric else None,
                'target_name': rule.target.name if rule.target else None,
                'condition': rule.condition,
            },
        )

        return {
            'alert_id': str(alert.id),
            'rule_id': rule_id,
            'rule_name': rule.name,
            'severity': rule.severity,
            'title': alert.title,
            'description': alert.description,
            'current_value': current_value,
            'threshold_value': threshold_value,
        }

    def _generate_alert_description(self, rule: AlertRule, current_value: Optional[float], threshold_value: Optional[float]) -> str:
        """Build the multi-line, human-readable description stored on the alert.

        Lines for the current value, threshold and target are only included
        when the corresponding data is available.
        """
        lines = [f"Alert rule '{rule.name}' has been triggered.\n"]

        if rule.metric and current_value is not None:
            lines.append(f"Current value: {current_value} {rule.metric.unit}\n")

        if threshold_value is not None:
            lines.append(f"Threshold: {threshold_value} {rule.metric.unit if rule.metric else ''}\n")

        if rule.target:
            lines.append(f"Target: {rule.target.name}\n")

        lines.append(f"Severity: {rule.severity}\n")
        lines.append(f"Time: {timezone.now().strftime('%Y-%m-%d %H:%M:%S')}")

        return "".join(lines)
|
||||
|
||||
|
||||
class NotificationService:
|
||||
"""Service for sending alert notifications"""
|
||||
|
||||
def __init__(self):
|
||||
self.evaluator = AlertEvaluator()
|
||||
|
||||
def send_alert_notifications(self, alert_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Send notifications for an alert"""
|
||||
results = {}
|
||||
|
||||
# Get alert rule to determine notification channels
|
||||
rule_id = alert_data.get('rule_id')
|
||||
if not rule_id:
|
||||
return {'error': 'No rule ID provided'}
|
||||
|
||||
try:
|
||||
rule = AlertRule.objects.get(id=rule_id)
|
||||
except AlertRule.DoesNotExist:
|
||||
return {'error': 'Alert rule not found'}
|
||||
|
||||
notification_channels = rule.notification_channels or []
|
||||
|
||||
for channel in notification_channels:
|
||||
try:
|
||||
if channel['type'] == 'EMAIL':
|
||||
result = self._send_email_notification(alert_data, channel)
|
||||
elif channel['type'] == 'SLACK':
|
||||
result = self._send_slack_notification(alert_data, channel)
|
||||
elif channel['type'] == 'WEBHOOK':
|
||||
result = self._send_webhook_notification(alert_data, channel)
|
||||
else:
|
||||
result = {'error': f'Unknown notification channel type: {channel["type"]}'}
|
||||
|
||||
results[channel['type']] = result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to send {channel['type']} notification: {e}")
|
||||
results[channel['type']] = {'error': str(e)}
|
||||
|
||||
return results
|
||||
|
||||
def _send_email_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Send email notification"""
|
||||
try:
|
||||
recipients = channel.get('recipients', [])
|
||||
if not recipients:
|
||||
return {'error': 'No email recipients configured'}
|
||||
|
||||
subject = f"[{alert_data.get('severity', 'ALERT')}] {alert_data.get('title', 'System Alert')}"
|
||||
message = alert_data.get('description', '')
|
||||
|
||||
send_mail(
|
||||
subject=subject,
|
||||
message=message,
|
||||
from_email=settings.DEFAULT_FROM_EMAIL,
|
||||
recipient_list=recipients,
|
||||
fail_silently=False
|
||||
)
|
||||
|
||||
return {'status': 'sent', 'recipients': recipients}
|
||||
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
def _send_slack_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Send Slack notification"""
|
||||
try:
|
||||
webhook_url = channel.get('webhook_url')
|
||||
if not webhook_url:
|
||||
return {'error': 'No Slack webhook URL configured'}
|
||||
|
||||
# Create Slack message
|
||||
color = self._get_slack_color(alert_data.get('severity', 'MEDIUM'))
|
||||
|
||||
slack_message = {
|
||||
"text": alert_data.get('title', 'System Alert'),
|
||||
"attachments": [
|
||||
{
|
||||
"color": color,
|
||||
"fields": [
|
||||
{
|
||||
"title": "Description",
|
||||
"value": alert_data.get('description', ''),
|
||||
"short": False
|
||||
},
|
||||
{
|
||||
"title": "Severity",
|
||||
"value": alert_data.get('severity', 'UNKNOWN'),
|
||||
"short": True
|
||||
},
|
||||
{
|
||||
"title": "Time",
|
||||
"value": timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
"short": True
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Send to Slack (would use requests in real implementation)
|
||||
# requests.post(webhook_url, json=slack_message)
|
||||
|
||||
return {'status': 'sent', 'channel': channel.get('channel', '#alerts')}
|
||||
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
def _send_webhook_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""Send webhook notification"""
|
||||
try:
|
||||
webhook_url = channel.get('url')
|
||||
if not webhook_url:
|
||||
return {'error': 'No webhook URL configured'}
|
||||
|
||||
# Prepare webhook payload
|
||||
payload = {
|
||||
'alert': alert_data,
|
||||
'timestamp': timezone.now().isoformat(),
|
||||
'source': 'ETB-API-Monitoring'
|
||||
}
|
||||
|
||||
# Send webhook (would use requests in real implementation)
|
||||
# requests.post(webhook_url, json=payload)
|
||||
|
||||
return {'status': 'sent', 'url': webhook_url}
|
||||
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
|
||||
def _get_slack_color(self, severity: str) -> str:
|
||||
"""Get Slack color based on severity"""
|
||||
color_map = {
|
||||
'LOW': 'good',
|
||||
'MEDIUM': 'warning',
|
||||
'HIGH': 'danger',
|
||||
'CRITICAL': 'danger'
|
||||
}
|
||||
return color_map.get(severity, 'warning')
|
||||
|
||||
|
||||
class AlertingService:
    """Main alerting service that coordinates alert evaluation and notification."""

    def __init__(self):
        self.evaluator = AlertEvaluator()
        self.notification_service = NotificationService()

    @staticmethod
    def _summarize_notification_results(notification_results: Dict[str, Any]):
        """Split a notifier result into ``(sent_count, error_messages)``.

        ``notification_results`` is either a top-level ``{'error': <str>}``
        (the alert could not be dispatched at all) or a mapping of channel
        type to a per-channel result dict. A channel counts as sent unless
        its result contains an ``'error'`` key.
        """
        top_level_error = notification_results.get('error')
        if isinstance(top_level_error, str):
            return 0, [top_level_error]

        sent_count = 0
        error_messages = []
        for channel_type, outcome in notification_results.items():
            if isinstance(outcome, dict) and 'error' in outcome:
                error_messages.append(f"{channel_type}: {outcome['error']}")
            else:
                sent_count += 1
        return sent_count, error_messages

    @staticmethod
    def _to_optional_float(value: Any) -> Optional[float]:
        """Convert ``value`` to float, preserving ``None`` (and a legitimate 0)."""
        return None if value is None else float(value)

    def run_alert_evaluation(self) -> Dict[str, Any]:
        """Evaluate all rules and notify for every alert that fired.

        Returns:
            Dict with ``evaluated_rules`` (number of active, enabled rules),
            ``triggered_alerts`` (alerts created), ``notifications_sent``
            (alerts for which at least one channel reported success) and
            ``errors`` (list of error strings, including per-channel
            notification failures).
        """
        results = {
            'evaluated_rules': 0,
            'triggered_alerts': 0,
            'notifications_sent': 0,
            'errors': [],
        }

        try:
            # Evaluate all alert rules
            triggered_alerts = self.evaluator.evaluate_alert_rules()
            results['triggered_alerts'] = len(triggered_alerts)

            # Send notifications for triggered alerts
            for alert_data in triggered_alerts:
                try:
                    notification_results = self.notification_service.send_alert_notifications(alert_data)
                    sent_count, error_messages = self._summarize_notification_results(notification_results)
                    # Only count the alert as notified when something was
                    # actually delivered; surface failures to the caller.
                    if sent_count:
                        results['notifications_sent'] += 1
                    results['errors'].extend(error_messages)
                except Exception as e:
                    logger.error(f"Failed to send notifications for alert {alert_data.get('alert_id')}: {e}")
                    results['errors'].append(str(e))

            # Count evaluated rules
            results['evaluated_rules'] = AlertRule.objects.filter(
                status='ACTIVE',
                is_enabled=True,
            ).count()

        except Exception as e:
            logger.error(f"Alert evaluation failed: {e}")
            results['errors'].append(str(e))

        return results

    def _transition_alert(self, alert_id: str, user: User, *, status: str,
                          actor_field: str, timestamp_field: str, verb: str) -> Dict[str, Any]:
        """Move an alert to ``status`` and record who did it and when.

        Shared implementation of ``acknowledge_alert`` and ``resolve_alert``.
        Returns a ``{'status': 'success' | 'error', 'message': ...}`` dict and
        never raises.
        """
        try:
            alert = Alert.objects.get(id=alert_id)
            alert.status = status
            setattr(alert, actor_field, user)
            setattr(alert, timestamp_field, timezone.now())
            alert.save()

            return {
                'status': 'success',
                'message': f'Alert {alert_id} {verb} by {user.username}',
            }

        except Alert.DoesNotExist:
            return {
                'status': 'error',
                'message': f'Alert {alert_id} not found',
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': str(e),
            }

    def acknowledge_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
        """Mark an alert as ``ACKNOWLEDGED`` by ``user``.

        Returns a ``{'status', 'message'}`` dict; errors are reported in the
        dict rather than raised.
        """
        return self._transition_alert(
            alert_id,
            user,
            status='ACKNOWLEDGED',
            actor_field='acknowledged_by',
            timestamp_field='acknowledged_at',
            verb='acknowledged',
        )

    def resolve_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
        """Mark an alert as ``RESOLVED`` by ``user``.

        Returns a ``{'status', 'message'}`` dict; errors are reported in the
        dict rather than raised.
        """
        return self._transition_alert(
            alert_id,
            user,
            status='RESOLVED',
            actor_field='resolved_by',
            timestamp_field='resolved_at',
            verb='resolved',
        )

    def get_active_alerts(self, severity: Optional[str] = None) -> List[Dict[str, Any]]:
        """List alerts in ``TRIGGERED`` state, newest first.

        Args:
            severity: When given, only alerts with this severity are returned.

        Returns:
            One dict per alert with ``id``, ``title``, ``description``,
            ``severity``, ``triggered_at``, ``rule_name``, ``current_value``
            and ``threshold_value``.
        """
        # select_related avoids one extra query per alert for ``rule.name``.
        alerts = Alert.objects.filter(status='TRIGGERED').select_related('rule')

        if severity:
            alerts = alerts.filter(severity=severity)

        return [
            {
                'id': str(alert.id),
                'title': alert.title,
                'description': alert.description,
                'severity': alert.severity,
                'triggered_at': alert.triggered_at,
                'rule_name': alert.rule.name,
                # ``is not None`` (not truthiness) so a value of 0 survives.
                'current_value': self._to_optional_float(alert.triggered_value),
                'threshold_value': self._to_optional_float(alert.threshold_value),
            }
            for alert in alerts.order_by('-triggered_at')
        ]
|
||||
Reference in New Issue
Block a user