Updates
1
ETB-API/monitoring/services/__init__.py
Normal file
@@ -0,0 +1 @@
# Monitoring services
BIN
ETB-API/monitoring/services/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ETB-API/monitoring/services/__pycache__/alerting.cpython-312.pyc
Normal file
Binary file not shown.
449
ETB-API/monitoring/services/alerting.py
Normal file
@@ -0,0 +1,449 @@
"""
Alerting service for monitoring system
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import timedelta
from django.utils import timezone
from django.core.mail import send_mail
from django.conf import settings
from django.contrib.auth import get_user_model

from monitoring.models import AlertRule, Alert, MetricMeasurement

User = get_user_model()
logger = logging.getLogger(__name__)


class AlertEvaluator:
    """Service for evaluating alert conditions"""

    def __init__(self):
        self.aggregator = None  # Imported lazily to avoid circular imports

    def evaluate_alert_rules(self) -> List[Dict[str, Any]]:
        """Evaluate all active alert rules"""
        triggered_alerts = []

        active_rules = AlertRule.objects.filter(
            status='ACTIVE',
            is_enabled=True
        )

        for rule in active_rules:
            try:
                if self._evaluate_rule(rule):
                    alert_data = self._create_alert(rule)
                    triggered_alerts.append(alert_data)
            except Exception as e:
                logger.error(f"Failed to evaluate alert rule {rule.name}: {e}")

        return triggered_alerts

    def _evaluate_rule(self, rule: AlertRule) -> bool:
        """Evaluate whether an alert rule's condition is met"""
        condition = rule.condition
        condition_type = condition.get('type')

        if condition_type == 'THRESHOLD':
            return self._evaluate_threshold_condition(rule, condition)
        elif condition_type == 'ANOMALY':
            return self._evaluate_anomaly_condition(rule, condition)
        elif condition_type == 'AVAILABILITY':
            return self._evaluate_availability_condition(rule, condition)
        elif condition_type == 'PATTERN':
            return self._evaluate_pattern_condition(rule, condition)
        else:
            logger.warning(f"Unknown condition type: {condition_type}")
            return False

    def _evaluate_threshold_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate threshold-based alert conditions"""
        if not rule.metric:
            return False

        # Get the latest metric value
        latest_measurement = MetricMeasurement.objects.filter(
            metric=rule.metric
        ).order_by('-timestamp').first()

        if not latest_measurement:
            return False

        current_value = float(latest_measurement.value)
        threshold_value = condition.get('threshold')
        operator = condition.get('operator', '>')

        if threshold_value is None:
            # Comparing against None would raise a TypeError
            logger.warning(f"No threshold configured for rule {rule.name}")
            return False

        if operator == '>':
            return current_value > threshold_value
        elif operator == '>=':
            return current_value >= threshold_value
        elif operator == '<':
            return current_value < threshold_value
        elif operator == '<=':
            return current_value <= threshold_value
        elif operator == '==':
            return current_value == threshold_value
        elif operator == '!=':
            return current_value != threshold_value
        else:
            logger.warning(f"Unknown operator: {operator}")
            return False

    def _evaluate_anomaly_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate anomaly-based alert conditions"""
        # This would integrate with anomaly detection models;
        # for now, implement simple statistical anomaly detection.

        if not rule.metric:
            return False

        # Get recent measurements
        since = timezone.now() - timedelta(hours=24)
        measurements = MetricMeasurement.objects.filter(
            metric=rule.metric,
            timestamp__gte=since
        ).order_by('-timestamp')[:100]  # Last 100 measurements

        if len(measurements) < 10:  # Need a minimum number of data points
            return False

        values = [float(m.value) for m in measurements]

        # Calculate mean and standard deviation
        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        std_dev = variance ** 0.5

        # Flag the latest value as an anomaly if it deviates from the mean
        # by more than the configured number of standard deviations
        latest_value = values[0]
        anomaly_threshold = condition.get('threshold', 2.0)  # Default: 2 sigma

        return abs(latest_value - mean) > (anomaly_threshold * std_dev)

    def _evaluate_availability_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate availability-based alert conditions"""
        if not rule.target:
            return False

        # Trigger when the target is in a critical state
        return rule.target.last_status == 'CRITICAL'

    def _evaluate_pattern_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate pattern-based alert conditions"""
        # This would integrate with pattern detection algorithms;
        # for now, return False as a placeholder.
        return False

    def _create_alert(self, rule: AlertRule) -> Dict[str, Any]:
        """Create an alert instance"""
        # Get the current value for context
        current_value = None
        threshold_value = None

        if rule.metric:
            latest_measurement = MetricMeasurement.objects.filter(
                metric=rule.metric
            ).order_by('-timestamp').first()
            if latest_measurement:
                current_value = float(latest_measurement.value)
                threshold_value = rule.metric.critical_threshold

        # Create the alert
        alert = Alert.objects.create(
            rule=rule,
            title=f"{rule.name} - {rule.severity}",
            description=self._generate_alert_description(rule, current_value, threshold_value),
            severity=rule.severity,
            triggered_value=current_value,
            threshold_value=threshold_value,
            context_data={
                'rule_id': str(rule.id),
                'metric_name': rule.metric.name if rule.metric else None,
                'target_name': rule.target.name if rule.target else None,
                'condition': rule.condition
            }
        )

        return {
            'alert_id': str(alert.id),
            'rule_id': str(rule.id),  # NotificationService.send_alert_notifications looks this up
            'rule_name': rule.name,
            'severity': rule.severity,
            'title': alert.title,
            'description': alert.description,
            'current_value': current_value,
            'threshold_value': threshold_value
        }

    def _generate_alert_description(self, rule: AlertRule, current_value: Optional[float], threshold_value: Optional[float]) -> str:
        """Generate the alert description"""
        description = f"Alert rule '{rule.name}' has been triggered.\n"

        if rule.metric and current_value is not None:
            description += f"Current value: {current_value} {rule.metric.unit}\n"

        if threshold_value is not None:
            description += f"Threshold: {threshold_value} {rule.metric.unit if rule.metric else ''}\n"

        if rule.target:
            description += f"Target: {rule.target.name}\n"

        description += f"Severity: {rule.severity}\n"
        description += f"Time: {timezone.now().strftime('%Y-%m-%d %H:%M:%S')}"

        return description


class NotificationService:
    """Service for sending alert notifications"""

    def __init__(self):
        self.evaluator = AlertEvaluator()

    def send_alert_notifications(self, alert_data: Dict[str, Any]) -> Dict[str, Any]:
        """Send notifications for an alert"""
        results = {}

        # Get the alert rule to determine notification channels
        rule_id = alert_data.get('rule_id')
        if not rule_id:
            return {'error': 'No rule ID provided'}

        try:
            rule = AlertRule.objects.get(id=rule_id)
        except AlertRule.DoesNotExist:
            return {'error': 'Alert rule not found'}

        notification_channels = rule.notification_channels or []

        for channel in notification_channels:
            try:
                if channel['type'] == 'EMAIL':
                    result = self._send_email_notification(alert_data, channel)
                elif channel['type'] == 'SLACK':
                    result = self._send_slack_notification(alert_data, channel)
                elif channel['type'] == 'WEBHOOK':
                    result = self._send_webhook_notification(alert_data, channel)
                else:
                    result = {'error': f'Unknown notification channel type: {channel["type"]}'}

                results[channel['type']] = result

            except Exception as e:
                logger.error(f"Failed to send {channel['type']} notification: {e}")
                results[channel['type']] = {'error': str(e)}

        return results

    def _send_email_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send an email notification"""
        try:
            recipients = channel.get('recipients', [])
            if not recipients:
                return {'error': 'No email recipients configured'}

            subject = f"[{alert_data.get('severity', 'ALERT')}] {alert_data.get('title', 'System Alert')}"
            message = alert_data.get('description', '')

            send_mail(
                subject=subject,
                message=message,
                from_email=settings.DEFAULT_FROM_EMAIL,
                recipient_list=recipients,
                fail_silently=False
            )

            return {'status': 'sent', 'recipients': recipients}

        except Exception as e:
            return {'error': str(e)}

    def _send_slack_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send a Slack notification"""
        try:
            webhook_url = channel.get('webhook_url')
            if not webhook_url:
                return {'error': 'No Slack webhook URL configured'}

            # Build the Slack message
            color = self._get_slack_color(alert_data.get('severity', 'MEDIUM'))

            slack_message = {
                "text": alert_data.get('title', 'System Alert'),
                "attachments": [
                    {
                        "color": color,
                        "fields": [
                            {
                                "title": "Description",
                                "value": alert_data.get('description', ''),
                                "short": False
                            },
                            {
                                "title": "Severity",
                                "value": alert_data.get('severity', 'UNKNOWN'),
                                "short": True
                            },
                            {
                                "title": "Time",
                                "value": timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                                "short": True
                            }
                        ]
                    }
                ]
            }

            # Send to Slack (would use requests in a real implementation)
            # requests.post(webhook_url, json=slack_message)

            return {'status': 'sent', 'channel': channel.get('channel', '#alerts')}

        except Exception as e:
            return {'error': str(e)}

    def _send_webhook_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send a webhook notification"""
        try:
            webhook_url = channel.get('url')
            if not webhook_url:
                return {'error': 'No webhook URL configured'}

            # Prepare the webhook payload
            payload = {
                'alert': alert_data,
                'timestamp': timezone.now().isoformat(),
                'source': 'ETB-API-Monitoring'
            }

            # Send the webhook (would use requests in a real implementation)
            # requests.post(webhook_url, json=payload)

            return {'status': 'sent', 'url': webhook_url}

        except Exception as e:
            return {'error': str(e)}

    def _get_slack_color(self, severity: str) -> str:
        """Get the Slack attachment color for a severity level"""
        color_map = {
            'LOW': 'good',
            'MEDIUM': 'warning',
            'HIGH': 'danger',
            'CRITICAL': 'danger'
        }
        return color_map.get(severity, 'warning')


class AlertingService:
    """Main alerting service that coordinates alert evaluation and notification"""

    def __init__(self):
        self.evaluator = AlertEvaluator()
        self.notification_service = NotificationService()

    def run_alert_evaluation(self) -> Dict[str, Any]:
        """Run alert evaluation and send notifications"""
        results = {
            'evaluated_rules': 0,
            'triggered_alerts': 0,
            'notifications_sent': 0,
            'errors': []
        }

        try:
            # Evaluate all alert rules
            triggered_alerts = self.evaluator.evaluate_alert_rules()
            results['triggered_alerts'] = len(triggered_alerts)

            # Send notifications for triggered alerts
            for alert_data in triggered_alerts:
                try:
                    self.notification_service.send_alert_notifications(alert_data)
                    results['notifications_sent'] += 1
                except Exception as e:
                    logger.error(f"Failed to send notifications for alert {alert_data.get('alert_id')}: {e}")
                    results['errors'].append(str(e))

            # Count the evaluated rules
            results['evaluated_rules'] = AlertRule.objects.filter(
                status='ACTIVE',
                is_enabled=True
            ).count()

        except Exception as e:
            logger.error(f"Alert evaluation failed: {e}")
            results['errors'].append(str(e))

        return results

    def acknowledge_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
        """Acknowledge an alert"""
        try:
            alert = Alert.objects.get(id=alert_id)
            alert.status = 'ACKNOWLEDGED'
            alert.acknowledged_by = user
            alert.acknowledged_at = timezone.now()
            alert.save()

            return {
                'status': 'success',
                'message': f'Alert {alert_id} acknowledged by {user.username}'
            }

        except Alert.DoesNotExist:
            return {
                'status': 'error',
                'message': f'Alert {alert_id} not found'
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': str(e)
            }

    def resolve_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
        """Resolve an alert"""
        try:
            alert = Alert.objects.get(id=alert_id)
            alert.status = 'RESOLVED'
            alert.resolved_by = user
            alert.resolved_at = timezone.now()
            alert.save()

            return {
                'status': 'success',
                'message': f'Alert {alert_id} resolved by {user.username}'
            }

        except Alert.DoesNotExist:
            return {
                'status': 'error',
                'message': f'Alert {alert_id} not found'
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': str(e)
            }

    def get_active_alerts(self, severity: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get active alerts"""
        alerts = Alert.objects.filter(status='TRIGGERED')

        if severity:
            alerts = alerts.filter(severity=severity)

        return [
            {
                'id': str(alert.id),
                'title': alert.title,
                'description': alert.description,
                'severity': alert.severity,
                'triggered_at': alert.triggered_at,
                'rule_name': alert.rule.name,
                'current_value': float(alert.triggered_value) if alert.triggered_value is not None else None,
                'threshold_value': float(alert.threshold_value) if alert.threshold_value is not None else None
            }
            for alert in alerts.order_by('-triggered_at')
        ]
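Both _send_slack_notification and _send_webhook_notification above stop short of actual HTTP delivery and leave requests.post commented out. A minimal sketch of that missing step, assuming the requests package (already imported by health_checks.py below) is acceptable here; the helper name post_json, the 5-second timeout, and the returned dict shape are illustrative choices, not part of this commit:

import requests

def post_json(url: str, payload: dict, timeout: float = 5.0) -> dict:
    # Hypothetical helper: POST the JSON payload and map the outcome onto
    # the same result dicts the channel senders already return.
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()  # treat 4xx/5xx responses as delivery failures
        return {'status': 'sent', 'http_status': response.status_code}
    except requests.RequestException as e:
        return {'error': str(e)}

With a helper like this, the commented-out calls would become post_json(webhook_url, slack_message) and post_json(webhook_url, payload) respectively.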
372
ETB-API/monitoring/services/health_checks.py
Normal file
@@ -0,0 +1,372 @@
"""
Health check services for monitoring system components
"""
import time
import requests
import psutil
import logging
from typing import Dict, Any
from django.db import connection
from django.core.cache import cache
from django.utils import timezone
from celery import current_app as celery_app

logger = logging.getLogger(__name__)


class BaseHealthCheck:
    """Base class for health checks"""

    def __init__(self, target):
        self.target = target
        self.start_time = None
        self.end_time = None

    def execute(self) -> Dict[str, Any]:
        """Execute the health check and return the results"""
        self.start_time = time.time()
        try:
            result = self._perform_check()
            self.end_time = time.time()

            result['response_time_ms'] = int((self.end_time - self.start_time) * 1000)
            result['checked_at'] = timezone.now()
            # Preserve any error message the check itself reported
            result.setdefault('error_message', None)

            return result
        except Exception as e:
            self.end_time = time.time()
            logger.error(f"Health check failed for {self.target.name}: {e}")
            return {
                'status': 'CRITICAL',
                'response_time_ms': int((self.end_time - self.start_time) * 1000),
                'checked_at': timezone.now(),
                'error_message': str(e)
            }

    def _perform_check(self) -> Dict[str, Any]:
        """Override in subclasses to implement specific checks"""
        raise NotImplementedError


class HTTPHealthCheck(BaseHealthCheck):
    """HTTP-based health check"""

    def _perform_check(self) -> Dict[str, Any]:
        url = self.target.endpoint_url
        if not url:
            raise ValueError("No endpoint URL configured")

        timeout = self.target.timeout_seconds
        expected_codes = self.target.expected_status_codes or [200]

        response = requests.get(url, timeout=timeout)

        if response.status_code in expected_codes:
            status = 'HEALTHY'
        elif response.status_code >= 500:
            status = 'CRITICAL'
        else:
            status = 'WARNING'

        return {
            'status': status,
            'status_code': response.status_code,
            'response_body': response.text[:1000]  # Limit response body size
        }


class DatabaseHealthCheck(BaseHealthCheck):
    """Database connection health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
                result = cursor.fetchone()

            if result and result[0] == 1:
                return {
                    'status': 'HEALTHY',
                    'status_code': 200
                }
            else:
                return {
                    'status': 'CRITICAL',
                    'status_code': 500,
                    'error_message': 'Database query returned unexpected result'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Database connection failed: {str(e)}'
            }


class CacheHealthCheck(BaseHealthCheck):
    """Cache system health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            # Test a cache write/read round trip
            test_key = f"health_check_{int(time.time())}"
            test_value = "health_check_value"

            cache.set(test_key, test_value, timeout=10)
            retrieved_value = cache.get(test_key)

            if retrieved_value == test_value:
                cache.delete(test_key)  # Clean up
                return {
                    'status': 'HEALTHY',
                    'status_code': 200
                }
            else:
                return {
                    'status': 'CRITICAL',
                    'status_code': 500,
                    'error_message': 'Cache read/write test failed'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Cache operation failed: {str(e)}'
            }


class CeleryHealthCheck(BaseHealthCheck):
    """Celery worker health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            # Check whether any Celery workers are active
            inspect = celery_app.control.inspect()
            active_workers = inspect.active()

            if active_workers:
                worker_count = len(active_workers)
                return {
                    'status': 'HEALTHY',
                    'status_code': 200,
                    'response_body': f'Active workers: {worker_count}'
                }
            else:
                return {
                    'status': 'CRITICAL',
                    'status_code': 500,
                    'error_message': 'No active Celery workers found'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Celery health check failed: {str(e)}'
            }


class SystemResourceHealthCheck(BaseHealthCheck):
    """System resource health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            # Get system metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Determine status based on thresholds
            status = 'HEALTHY'
            if cpu_percent > 90 or memory.percent > 90 or disk.percent > 90:
                status = 'CRITICAL'
            elif cpu_percent > 80 or memory.percent > 80 or disk.percent > 80:
                status = 'WARNING'

            return {
                'status': status,
                'status_code': 200,
                'cpu_usage_percent': cpu_percent,
                'memory_usage_percent': memory.percent,
                'disk_usage_percent': disk.percent,
                'response_body': f'CPU: {cpu_percent}%, Memory: {memory.percent}%, Disk: {disk.percent}%'
            }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'System resource check failed: {str(e)}'
            }


class ModuleHealthCheck(BaseHealthCheck):
    """Django module health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            module_name = self.target.related_module
            if not module_name:
                raise ValueError("No module specified for module health check")

            # Import the module to check that it is accessible
            __import__(module_name)

            # Check that the module is registered as a Django app
            from django.apps import apps
            app_config = apps.get_app_config(module_name)

            if app_config:
                return {
                    'status': 'HEALTHY',
                    'status_code': 200,
                    'response_body': f'Module {module_name} is accessible'
                }
            else:
                return {
                    'status': 'WARNING',
                    'status_code': 200,
                    'error_message': f'Module {module_name} not found in Django apps'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Module health check failed: {str(e)}'
            }


class HealthCheckFactory:
    """Factory for creating health check instances"""

    CHECK_CLASSES = {
        'HTTP': HTTPHealthCheck,
        'DATABASE': DatabaseHealthCheck,
        'CACHE': CacheHealthCheck,
        'QUEUE': CeleryHealthCheck,
        'CUSTOM': ModuleHealthCheck,  # BaseHealthCheck is abstract; MODULE targets map to CUSTOM
        'PING': HTTPHealthCheck,  # Use HTTP for ping
        'SSL': HTTPHealthCheck,  # Use HTTP for SSL
    }

    @classmethod
    def create_health_check(cls, target, check_type: str) -> BaseHealthCheck:
        """Create a health check instance based on type"""
        check_class = cls.CHECK_CLASSES.get(check_type, BaseHealthCheck)
        return check_class(target)

    @classmethod
    def get_available_check_types(cls) -> list:
        """Get the list of available health check types"""
        return list(cls.CHECK_CLASSES.keys())


class HealthCheckService:
    """Service for managing health checks"""

    def __init__(self):
        self.factory = HealthCheckFactory()

    def execute_health_check(self, target, check_type: str) -> Dict[str, Any]:
        """Execute a health check for a target"""
        health_check = self.factory.create_health_check(target, check_type)
        return health_check.execute()

    def execute_all_health_checks(self) -> Dict[str, Any]:
        """Execute health checks for all active targets"""
        from monitoring.models import MonitoringTarget, HealthCheck

        results = {}
        active_targets = MonitoringTarget.objects.filter(
            status='ACTIVE',
            health_check_enabled=True
        )

        for target in active_targets:
            try:
                # Determine the check type based on the target type
                check_type = self._get_check_type_for_target(target)

                # Execute the health check
                result = self.execute_health_check(target, check_type)

                # Save the result to the database
                HealthCheck.objects.create(
                    target=target,
                    check_type=check_type,
                    status=result['status'],
                    response_time_ms=result.get('response_time_ms'),
                    status_code=result.get('status_code'),
                    response_body=result.get('response_body'),
                    error_message=result.get('error_message'),
                    cpu_usage_percent=result.get('cpu_usage_percent'),
                    memory_usage_percent=result.get('memory_usage_percent'),
                    disk_usage_percent=result.get('disk_usage_percent')
                )

                # Update the target status
                target.last_checked = timezone.now()
                target.last_status = result['status']
                target.save(update_fields=['last_checked', 'last_status'])

                results[target.name] = result

            except Exception as e:
                logger.error(f"Failed to execute health check for {target.name}: {e}")
                results[target.name] = {
                    'status': 'CRITICAL',
                    'error_message': str(e)
                }

        return results

    def _get_check_type_for_target(self, target) -> str:
        """Determine the appropriate check type for a target"""
        target_type_mapping = {
            'APPLICATION': 'HTTP',
            'DATABASE': 'DATABASE',
            'CACHE': 'CACHE',
            'QUEUE': 'QUEUE',
            'EXTERNAL_API': 'HTTP',
            'SERVICE': 'HTTP',
            'INFRASTRUCTURE': 'HTTP',
            'MODULE': 'CUSTOM',
        }

        return target_type_mapping.get(target.target_type, 'HTTP')

    def get_system_health_summary(self) -> Dict[str, Any]:
        """Get an overall system health summary"""
        from monitoring.models import HealthCheck, MonitoringTarget

        # Get the latest health check for each target (PostgreSQL DISTINCT ON)
        latest_checks = HealthCheck.objects.filter(
            target__status='ACTIVE'
        ).order_by('target', '-checked_at').distinct('target')

        total_targets = MonitoringTarget.objects.filter(status='ACTIVE').count()
        healthy_targets = latest_checks.filter(status='HEALTHY').count()
        warning_targets = latest_checks.filter(status='WARNING').count()
        critical_targets = latest_checks.filter(status='CRITICAL').count()

        # Calculate the overall status
        if critical_targets > 0:
            overall_status = 'CRITICAL'
        elif warning_targets > 0:
            overall_status = 'WARNING'
        elif healthy_targets == total_targets:
            overall_status = 'HEALTHY'
        else:
            overall_status = 'UNKNOWN'

        return {
            'overall_status': overall_status,
            'total_targets': total_targets,
            'healthy_targets': healthy_targets,
            'warning_targets': warning_targets,
            'critical_targets': critical_targets,
            'health_percentage': (healthy_targets / total_targets * 100) if total_targets > 0 else 0,
            'last_updated': timezone.now()
        }
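Nothing in this commit shows how execute_all_health_checks gets scheduled. A hedged wiring sketch using a Celery beat task, which fits the celery dependency the file already has; the module path monitoring/tasks.py, the task name, and the 60-second cadence are illustrative assumptions, not part of the commit:

# monitoring/tasks.py (hypothetical wiring, not in this commit)
from celery import shared_task

from monitoring.services.health_checks import HealthCheckService

@shared_task
def run_health_checks():
    # Results are persisted to the HealthCheck model by the service itself,
    # so nothing is returned (the dicts contain non-JSON-serializable datetimes).
    HealthCheckService().execute_all_health_checks()

# Illustrative beat entry, assuming the common CELERY settings namespace:
CELERY_BEAT_SCHEDULE = {
    'run-health-checks': {
        'task': 'monitoring.tasks.run_health_checks',
        'schedule': 60.0,  # seconds between rounds; tune to target sensitivity
    },
}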
364
ETB-API/monitoring/services/metrics_collector.py
Normal file
@@ -0,0 +1,364 @@
"""
Metrics collection service for system monitoring
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from django.utils import timezone

from monitoring.models import SystemMetric, MetricMeasurement

logger = logging.getLogger(__name__)


class MetricsCollector:
    """Service for collecting and storing system metrics"""

    def __init__(self):
        self.collected_metrics = {}

    def collect_all_metrics(self) -> Dict[str, Any]:
        """Collect all configured metrics"""
        results = {}

        # Get all active metrics
        active_metrics = SystemMetric.objects.filter(is_active=True)

        for metric in active_metrics:
            try:
                value = self._collect_metric_value(metric)
                if value is not None:
                    # Store the measurement
                    measurement = MetricMeasurement.objects.create(
                        metric=metric,
                        value=value,
                        tags=self._get_metric_tags(metric),
                        metadata=self._get_metric_metadata(metric)
                    )

                    results[metric.name] = {
                        'value': value,
                        'measurement_id': measurement.id,
                        'timestamp': measurement.timestamp
                    }

            except Exception as e:
                logger.error(f"Failed to collect metric {metric.name}: {e}")
                results[metric.name] = {
                    'error': str(e)
                }

        return results

    def _collect_metric_value(self, metric: SystemMetric) -> Optional[float]:
        """Collect the value for a specific metric"""
        category = metric.category

        if category == 'API_RESPONSE_TIME':
            return self._collect_api_response_time(metric)
        elif category == 'THROUGHPUT':
            return self._collect_throughput(metric)
        elif category == 'ERROR_RATE':
            return self._collect_error_rate(metric)
        elif category == 'AVAILABILITY':
            return self._collect_availability(metric)
        elif category == 'INCIDENT_COUNT':
            return self._collect_incident_count(metric)
        elif category == 'MTTR':
            return self._collect_mttr(metric)
        elif category == 'MTTA':
            return self._collect_mtta(metric)
        elif category == 'SLA_COMPLIANCE':
            return self._collect_sla_compliance(metric)
        elif category == 'SECURITY_EVENTS':
            return self._collect_security_events(metric)
        elif category == 'AUTOMATION_SUCCESS':
            return self._collect_automation_success(metric)
        elif category == 'AI_ACCURACY':
            return self._collect_ai_accuracy(metric)
        elif category == 'COST_IMPACT':
            return self._collect_cost_impact(metric)
        elif category == 'USER_ACTIVITY':
            return self._collect_user_activity(metric)
        elif category == 'SYSTEM_RESOURCES':
            return self._collect_system_resources(metric)
        else:
            logger.warning(f"Unknown metric category: {category}")
            return None

    def _collect_api_response_time(self, metric: SystemMetric) -> Optional[float]:
        """Collect API response time metrics"""
        # This would typically come from middleware or APM tools;
        # for now, return a mock value.
        return 150.5  # milliseconds

    def _collect_throughput(self, metric: SystemMetric) -> Optional[float]:
        """Collect throughput metrics (requests per minute)"""
        # Count requests in the last minute.
        # This would typically come from access logs or middleware.
        return 120.0  # requests per minute

    def _collect_error_rate(self, metric: SystemMetric) -> Optional[float]:
        """Collect error rate metrics"""
        # Count errors in the last hour.
        # This would typically come from logs or error tracking.
        return 0.02  # 2% error rate

    def _collect_availability(self, metric: SystemMetric) -> Optional[float]:
        """Collect availability metrics"""
        # Calculate the availability percentage.
        # This would typically come from uptime monitoring.
        return 99.9  # 99.9% availability

    def _collect_incident_count(self, metric: SystemMetric) -> Optional[float]:
        """Collect incident count metrics"""
        from incident_intelligence.models import Incident

        # Count incidents in the last 24 hours
        since = timezone.now() - timedelta(hours=24)
        count = Incident.objects.filter(created_at__gte=since).count()
        return float(count)

    def _collect_mttr(self, metric: SystemMetric) -> Optional[float]:
        """Collect Mean Time to Resolve metrics"""
        from incident_intelligence.models import Incident

        # Calculate MTTR for incidents resolved in the last 7 days
        since = timezone.now() - timedelta(days=7)
        resolved_incidents = Incident.objects.filter(
            status__in=['RESOLVED', 'CLOSED'],
            resolved_at__isnull=False,
            resolved_at__gte=since
        )

        if not resolved_incidents.exists():
            return None

        total_resolution_time = 0
        count = 0

        for incident in resolved_incidents:
            if incident.resolved_at and incident.created_at:
                resolution_time = incident.resolved_at - incident.created_at
                total_resolution_time += resolution_time.total_seconds()
                count += 1

        if count > 0:
            return total_resolution_time / count / 60  # Convert to minutes
        return None

    def _collect_mtta(self, metric: SystemMetric) -> Optional[float]:
        """Collect Mean Time to Acknowledge metrics"""
        # This would require tracking when incidents are first acknowledged;
        # for now, return a mock value.
        return 15.5  # minutes

    def _collect_sla_compliance(self, metric: SystemMetric) -> Optional[float]:
        """Collect SLA compliance metrics"""
        from sla_oncall.models import SLAInstance

        # Calculate the SLA compliance percentage
        total_slas = SLAInstance.objects.count()
        if total_slas == 0:
            return None

        # This would require a more detailed SLA compliance calculation;
        # for now, return a mock value.
        return 95.5  # 95.5% SLA compliance

    def _collect_security_events(self, metric: SystemMetric) -> Optional[float]:
        """Collect security event metrics"""
        # Count security events in the last hour.
        # This would come from security logs or audit trails.
        return 3.0  # 3 security events in the last hour

    def _collect_automation_success(self, metric: SystemMetric) -> Optional[float]:
        """Collect automation success rate metrics"""
        from automation_orchestration.models import RunbookExecution

        # Calculate the success rate for runbook executions in the last 24 hours
        since = timezone.now() - timedelta(hours=24)
        executions = RunbookExecution.objects.filter(created_at__gte=since)

        if not executions.exists():
            return None

        successful = executions.filter(status='COMPLETED').count()
        total = executions.count()

        return (successful / total * 100) if total > 0 else None

    def _collect_ai_accuracy(self, metric: SystemMetric) -> Optional[float]:
        """Collect AI model accuracy metrics"""
        from incident_intelligence.models import IncidentClassification

        # Calculate accuracy for AI classifications
        classifications = IncidentClassification.objects.all()

        if not classifications.exists():
            return None

        # This would require comparing predictions with actual outcomes;
        # for now, return the average confidence score.
        total_confidence = sum(c.confidence_score for c in classifications)
        return (total_confidence / classifications.count() * 100) if classifications.count() > 0 else None

    def _collect_cost_impact(self, metric: SystemMetric) -> Optional[float]:
        """Collect cost impact metrics"""
        from analytics_predictive_insights.models import CostImpactAnalysis

        # Calculate the total cost impact for the last 30 days
        since = timezone.now() - timedelta(days=30)
        cost_analyses = CostImpactAnalysis.objects.filter(created_at__gte=since)

        total_cost = sum(float(ca.cost_amount) for ca in cost_analyses)
        return total_cost

    def _collect_user_activity(self, metric: SystemMetric) -> Optional[float]:
        """Collect user activity metrics"""
        # Count active users in the last hour.
        # This would require user activity tracking; for now, return a mock value.
        return 25.0  # 25 active users in the last hour

    def _collect_system_resources(self, metric: SystemMetric) -> Optional[float]:
        """Collect system resource metrics"""
        import psutil

        # Get the CPU usage
        cpu_percent = psutil.cpu_percent(interval=1)
        return cpu_percent

    def _get_metric_tags(self, metric: SystemMetric) -> Dict[str, str]:
        """Get tags for a metric measurement"""
        tags = {
            'metric_type': metric.metric_type,
            'category': metric.category,
        }

        if metric.related_module:
            tags['module'] = metric.related_module

        return tags

    def _get_metric_metadata(self, metric: SystemMetric) -> Dict[str, Any]:
        """Get metadata for a metric measurement"""
        return {
            'unit': metric.unit,
            'aggregation_method': metric.aggregation_method,
            'collection_interval': metric.collection_interval_seconds,
        }


class MetricsAggregator:
    """Service for aggregating metrics over time periods"""

    def __init__(self):
        self.collector = MetricsCollector()

    def aggregate_metrics(self, metric: SystemMetric, start_time: datetime, end_time: datetime) -> Dict[str, Any]:
        """Aggregate metrics over a time period"""
        measurements = MetricMeasurement.objects.filter(
            metric=metric,
            timestamp__gte=start_time,
            timestamp__lte=end_time
        ).order_by('timestamp')

        if not measurements.exists():
            return {
                'count': 0,
                'values': [],
                'aggregated_value': None
            }

        values = [float(m.value) for m in measurements]
        aggregated_value = self._aggregate_values(values, metric.aggregation_method)

        return {
            'count': len(values),
            'values': values,
            'aggregated_value': aggregated_value,
            'start_time': start_time,
            'end_time': end_time,
            'unit': metric.unit
        }

    def _aggregate_values(self, values: List[float], method: str) -> Optional[float]:
        """Aggregate a list of values using the specified method"""
        if not values:
            return None

        if method == 'AVERAGE':
            return sum(values) / len(values)
        elif method == 'SUM':
            return sum(values)
        elif method == 'COUNT':
            return len(values)
        elif method == 'MIN':
            return min(values)
        elif method == 'MAX':
            return max(values)
        elif method == 'PERCENTILE_95':
            return self._calculate_percentile(values, 95)
        elif method == 'PERCENTILE_99':
            return self._calculate_percentile(values, 99)
        else:
            return sum(values) / len(values)  # Default to average

    def _calculate_percentile(self, values: List[float], percentile: int) -> float:
        """Calculate a percentile of the values (nearest-rank approximation)"""
        sorted_values = sorted(values)
        index = int((percentile / 100) * len(sorted_values))
        return sorted_values[min(index, len(sorted_values) - 1)]

    def get_metric_trends(self, metric: SystemMetric, days: int = 7) -> Dict[str, Any]:
        """Get metric trends over a period"""
        end_time = timezone.now()
        start_time = end_time - timedelta(days=days)

        # Get daily aggregations
        daily_data = []
        for i in range(days):
            day_start = start_time + timedelta(days=i)
            day_end = day_start + timedelta(days=1)

            day_aggregation = self.aggregate_metrics(metric, day_start, day_end)
            daily_data.append({
                'date': day_start.date(),
                'value': day_aggregation['aggregated_value'],
                'count': day_aggregation['count']
            })

        return {
            'metric_name': metric.name,
            'period_days': days,
            'daily_data': daily_data,
            'trend': self._calculate_trend([d['value'] for d in daily_data if d['value'] is not None])
        }

    def _calculate_trend(self, values: List[float]) -> str:
        """Calculate the trend direction from a series of values"""
        if len(values) < 2:
            return 'STABLE'

        # Simple linear trend: compare the averages of the two halves
        first_half = values[:len(values) // 2]
        second_half = values[len(values) // 2:]

        first_avg = sum(first_half) / len(first_half)
        second_avg = sum(second_half) / len(second_half)

        change_percent = ((second_avg - first_avg) / first_avg) * 100 if first_avg != 0 else 0

        if change_percent > 5:
            return 'INCREASING'
        elif change_percent < -5:
            return 'DECREASING'
        else:
            return 'STABLE'
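A hedged usage sketch tying the collector and aggregator together: run one collection pass, then aggregate the last 24 hours of a single metric. The metric name 'cpu_usage' is a hypothetical example, and the sketch assumes at least one measurement exists in the window (otherwise aggregate_metrics returns only count/values/aggregated_value):

from datetime import timedelta
from django.utils import timezone

from monitoring.models import SystemMetric
from monitoring.services.metrics_collector import MetricsCollector, MetricsAggregator

MetricsCollector().collect_all_metrics()  # one pass over all active metrics

metric = SystemMetric.objects.get(name='cpu_usage')  # hypothetical metric name
window_end = timezone.now()
window_start = window_end - timedelta(hours=24)

summary = MetricsAggregator().aggregate_metrics(metric, window_start, window_end)
print(summary['count'], summary['aggregated_value'], summary.get('unit'))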