Iliyan Angelov
2025-09-19 11:58:53 +03:00
parent 306b20e24a
commit 6b247e5b9f
11423 changed files with 1500615 additions and 778 deletions

@@ -0,0 +1 @@
# Monitoring services

@@ -0,0 +1,449 @@
"""
Alerting service for monitoring system
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from django.utils import timezone
from django.core.mail import send_mail
from django.conf import settings
from django.contrib.auth import get_user_model
from monitoring.models import AlertRule, Alert, SystemMetric, MetricMeasurement, MonitoringTarget
User = get_user_model()
logger = logging.getLogger(__name__)
class AlertEvaluator:
"""Service for evaluating alert conditions"""
def __init__(self):
        self.aggregator = None  # Resolved lazily to avoid circular imports
def evaluate_alert_rules(self) -> List[Dict[str, Any]]:
"""Evaluate all active alert rules"""
triggered_alerts = []
active_rules = AlertRule.objects.filter(
status='ACTIVE',
is_enabled=True
)
for rule in active_rules:
try:
if self._evaluate_rule(rule):
alert_data = self._create_alert(rule)
triggered_alerts.append(alert_data)
except Exception as e:
logger.error(f"Failed to evaluate alert rule {rule.name}: {e}")
return triggered_alerts
def _evaluate_rule(self, rule: AlertRule) -> bool:
"""Evaluate if an alert rule condition is met"""
condition = rule.condition
condition_type = condition.get('type')
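        # Illustrative examples of the condition payload (the schema is an
        # assumption inferred from the evaluators below; only the keys read
        # there are shown):
        #   {"type": "THRESHOLD", "operator": ">", "threshold": 95.0}
        #   {"type": "ANOMALY", "threshold": 2.0}   # sigma multiplier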
if condition_type == 'THRESHOLD':
return self._evaluate_threshold_condition(rule, condition)
elif condition_type == 'ANOMALY':
return self._evaluate_anomaly_condition(rule, condition)
elif condition_type == 'AVAILABILITY':
return self._evaluate_availability_condition(rule, condition)
elif condition_type == 'PATTERN':
return self._evaluate_pattern_condition(rule, condition)
else:
logger.warning(f"Unknown condition type: {condition_type}")
return False
def _evaluate_threshold_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
"""Evaluate threshold-based alert conditions"""
if not rule.metric:
return False
# Get latest metric value
latest_measurement = MetricMeasurement.objects.filter(
metric=rule.metric
).order_by('-timestamp').first()
if not latest_measurement:
return False
current_value = float(latest_measurement.value)
        threshold_value = condition.get('threshold')
        if threshold_value is None:
            logger.warning(f"Alert rule '{rule.name}' has no threshold configured")
            return False
        operator = condition.get('operator', '>')
if operator == '>':
return current_value > threshold_value
elif operator == '>=':
return current_value >= threshold_value
elif operator == '<':
return current_value < threshold_value
elif operator == '<=':
return current_value <= threshold_value
elif operator == '==':
return current_value == threshold_value
elif operator == '!=':
return current_value != threshold_value
else:
logger.warning(f"Unknown operator: {operator}")
return False
def _evaluate_anomaly_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
"""Evaluate anomaly-based alert conditions"""
# This would integrate with anomaly detection models
# For now, implement a simple statistical anomaly detection
if not rule.metric:
return False
# Get recent measurements
since = timezone.now() - timedelta(hours=24)
measurements = MetricMeasurement.objects.filter(
metric=rule.metric,
timestamp__gte=since
).order_by('-timestamp')[:100] # Last 100 measurements
if len(measurements) < 10: # Need minimum data points
return False
values = [float(m.value) for m in measurements]
# Calculate mean and standard deviation
mean = sum(values) / len(values)
variance = sum((x - mean) ** 2 for x in values) / len(values)
std_dev = variance ** 0.5
# Check if latest value is an anomaly (more than 2 standard deviations)
latest_value = values[0]
anomaly_threshold = condition.get('threshold', 2.0) # Default 2 sigma
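        # Worked example: with mean=50, std_dev=5 and anomaly_threshold=2.0, a
        # latest value of 65 is flagged because |65 - 50| = 15 > 2.0 * 5 = 10.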
return abs(latest_value - mean) > (anomaly_threshold * std_dev)
def _evaluate_availability_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
"""Evaluate availability-based alert conditions"""
if not rule.target:
return False
# Check if target is in critical state
return rule.target.last_status == 'CRITICAL'
def _evaluate_pattern_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
"""Evaluate pattern-based alert conditions"""
# This would integrate with pattern detection algorithms
# For now, return False as placeholder
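        # A minimal sketch of what this could do, assuming a condition such as
        # {"type": "PATTERN", "breaches": 3, "threshold": 90.0} (these keys are
        # illustrative, not part of the current schema):
        #
        #   recent = MetricMeasurement.objects.filter(metric=rule.metric)\
        #       .order_by('-timestamp')[:int(condition.get('breaches', 3))]
        #   return bool(recent) and all(
        #       float(m.value) > condition['threshold'] for m in recent)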
return False
def _create_alert(self, rule: AlertRule) -> Dict[str, Any]:
"""Create an alert instance"""
# Get current value for context
current_value = None
threshold_value = None
if rule.metric:
latest_measurement = MetricMeasurement.objects.filter(
metric=rule.metric
).order_by('-timestamp').first()
if latest_measurement:
current_value = float(latest_measurement.value)
threshold_value = rule.metric.critical_threshold
# Create alert
alert = Alert.objects.create(
rule=rule,
title=f"{rule.name} - {rule.severity}",
description=self._generate_alert_description(rule, current_value, threshold_value),
severity=rule.severity,
triggered_value=current_value,
threshold_value=threshold_value,
context_data={
'rule_id': str(rule.id),
'metric_name': rule.metric.name if rule.metric else None,
'target_name': rule.target.name if rule.target else None,
'condition': rule.condition
}
)
        return {
            'alert_id': str(alert.id),
            'rule_id': str(rule.id),
            'rule_name': rule.name,
'severity': rule.severity,
'title': alert.title,
'description': alert.description,
'current_value': current_value,
'threshold_value': threshold_value
}
def _generate_alert_description(self, rule: AlertRule, current_value: Optional[float], threshold_value: Optional[float]) -> str:
"""Generate alert description"""
description = f"Alert rule '{rule.name}' has been triggered.\n"
if rule.metric and current_value is not None:
description += f"Current value: {current_value} {rule.metric.unit}\n"
if threshold_value is not None:
description += f"Threshold: {threshold_value} {rule.metric.unit if rule.metric else ''}\n"
if rule.target:
description += f"Target: {rule.target.name}\n"
description += f"Severity: {rule.severity}\n"
description += f"Time: {timezone.now().strftime('%Y-%m-%d %H:%M:%S')}"
return description
class NotificationService:
"""Service for sending alert notifications"""
def __init__(self):
self.evaluator = AlertEvaluator()
def send_alert_notifications(self, alert_data: Dict[str, Any]) -> Dict[str, Any]:
"""Send notifications for an alert"""
results = {}
# Get alert rule to determine notification channels
rule_id = alert_data.get('rule_id')
if not rule_id:
return {'error': 'No rule ID provided'}
try:
rule = AlertRule.objects.get(id=rule_id)
except AlertRule.DoesNotExist:
return {'error': 'Alert rule not found'}
notification_channels = rule.notification_channels or []
for channel in notification_channels:
try:
if channel['type'] == 'EMAIL':
result = self._send_email_notification(alert_data, channel)
elif channel['type'] == 'SLACK':
result = self._send_slack_notification(alert_data, channel)
elif channel['type'] == 'WEBHOOK':
result = self._send_webhook_notification(alert_data, channel)
else:
result = {'error': f'Unknown notification channel type: {channel["type"]}'}
results[channel['type']] = result
except Exception as e:
logger.error(f"Failed to send {channel['type']} notification: {e}")
results[channel['type']] = {'error': str(e)}
return results
def _send_email_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
"""Send email notification"""
try:
recipients = channel.get('recipients', [])
if not recipients:
return {'error': 'No email recipients configured'}
subject = f"[{alert_data.get('severity', 'ALERT')}] {alert_data.get('title', 'System Alert')}"
message = alert_data.get('description', '')
send_mail(
subject=subject,
message=message,
from_email=settings.DEFAULT_FROM_EMAIL,
recipient_list=recipients,
fail_silently=False
)
return {'status': 'sent', 'recipients': recipients}
except Exception as e:
return {'error': str(e)}
def _send_slack_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
"""Send Slack notification"""
try:
webhook_url = channel.get('webhook_url')
if not webhook_url:
return {'error': 'No Slack webhook URL configured'}
# Create Slack message
color = self._get_slack_color(alert_data.get('severity', 'MEDIUM'))
slack_message = {
"text": alert_data.get('title', 'System Alert'),
"attachments": [
{
"color": color,
"fields": [
{
"title": "Description",
"value": alert_data.get('description', ''),
"short": False
},
{
"title": "Severity",
"value": alert_data.get('severity', 'UNKNOWN'),
"short": True
},
{
"title": "Time",
"value": timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
"short": True
}
]
}
]
}
# Send to Slack (would use requests in real implementation)
# requests.post(webhook_url, json=slack_message)
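            # A fuller sketch of the real call (kept disabled here, matching the
            # stub above; enabling it and the 10-second timeout are assumptions):
            #
            #   import requests
            #   resp = requests.post(webhook_url, json=slack_message, timeout=10)
            #   resp.raise_for_status()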
return {'status': 'sent', 'channel': channel.get('channel', '#alerts')}
except Exception as e:
return {'error': str(e)}
def _send_webhook_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
"""Send webhook notification"""
try:
webhook_url = channel.get('url')
if not webhook_url:
return {'error': 'No webhook URL configured'}
# Prepare webhook payload
payload = {
'alert': alert_data,
'timestamp': timezone.now().isoformat(),
'source': 'ETB-API-Monitoring'
}
# Send webhook (would use requests in real implementation)
# requests.post(webhook_url, json=payload)
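            # A fuller sketch of the disabled call (illustrative; the timeout and
            # returning the HTTP status are assumptions):
            #
            #   import requests
            #   resp = requests.post(webhook_url, json=payload, timeout=10)
            #   return {'status': 'sent', 'url': webhook_url,
            #           'http_status': resp.status_code}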
return {'status': 'sent', 'url': webhook_url}
except Exception as e:
return {'error': str(e)}
def _get_slack_color(self, severity: str) -> str:
"""Get Slack color based on severity"""
color_map = {
'LOW': 'good',
'MEDIUM': 'warning',
'HIGH': 'danger',
'CRITICAL': 'danger'
}
return color_map.get(severity, 'warning')
class AlertingService:
"""Main alerting service that coordinates alert evaluation and notification"""
def __init__(self):
self.evaluator = AlertEvaluator()
self.notification_service = NotificationService()
def run_alert_evaluation(self) -> Dict[str, Any]:
"""Run alert evaluation and send notifications"""
results = {
'evaluated_rules': 0,
'triggered_alerts': 0,
'notifications_sent': 0,
'errors': []
}
try:
# Evaluate all alert rules
triggered_alerts = self.evaluator.evaluate_alert_rules()
results['triggered_alerts'] = len(triggered_alerts)
# Send notifications for triggered alerts
for alert_data in triggered_alerts:
try:
                    notification_results = self.notification_service.send_alert_notifications(alert_data)
                    if 'error' in notification_results:
                        results['errors'].append(notification_results['error'])
                    else:
                        results['notifications_sent'] += 1
except Exception as e:
logger.error(f"Failed to send notifications for alert {alert_data.get('alert_id')}: {e}")
results['errors'].append(str(e))
# Count evaluated rules
results['evaluated_rules'] = AlertRule.objects.filter(
status='ACTIVE',
is_enabled=True
).count()
except Exception as e:
logger.error(f"Alert evaluation failed: {e}")
results['errors'].append(str(e))
return results
def acknowledge_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
"""Acknowledge an alert"""
try:
alert = Alert.objects.get(id=alert_id)
alert.status = 'ACKNOWLEDGED'
alert.acknowledged_by = user
alert.acknowledged_at = timezone.now()
alert.save()
return {
'status': 'success',
'message': f'Alert {alert_id} acknowledged by {user.username}'
}
except Alert.DoesNotExist:
return {
'status': 'error',
'message': f'Alert {alert_id} not found'
}
except Exception as e:
return {
'status': 'error',
'message': str(e)
}
def resolve_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
"""Resolve an alert"""
try:
alert = Alert.objects.get(id=alert_id)
alert.status = 'RESOLVED'
alert.resolved_by = user
alert.resolved_at = timezone.now()
alert.save()
return {
'status': 'success',
'message': f'Alert {alert_id} resolved by {user.username}'
}
except Alert.DoesNotExist:
return {
'status': 'error',
'message': f'Alert {alert_id} not found'
}
except Exception as e:
return {
'status': 'error',
'message': str(e)
}
def get_active_alerts(self, severity: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get active alerts"""
alerts = Alert.objects.filter(status='TRIGGERED')
if severity:
alerts = alerts.filter(severity=severity)
return [
{
'id': str(alert.id),
'title': alert.title,
'description': alert.description,
'severity': alert.severity,
'triggered_at': alert.triggered_at,
'rule_name': alert.rule.name,
'current_value': float(alert.triggered_value) if alert.triggered_value else None,
'threshold_value': float(alert.threshold_value) if alert.threshold_value else None
}
for alert in alerts.order_by('-triggered_at')
]
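# --- Example wiring (illustrative sketch) ------------------------------------
# One way to drive the evaluation loop is a periodic Celery task. The task
# name and the idea of scheduling it via celery beat are assumptions made for
# illustration; any scheduler that calls run_alert_evaluation() works.
from celery import shared_task


@shared_task(name="monitoring.evaluate_alert_rules")
def evaluate_alert_rules_task() -> Dict[str, Any]:
    """Run alert evaluation and notification dispatch on a schedule."""
    results = AlertingService().run_alert_evaluation()
    logger.info("Alert evaluation finished: %s", results)
    return results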

@@ -0,0 +1,372 @@
"""
Health check services for monitoring system components
"""
import time
import requests
import psutil
import logging
from typing import Dict, Any, Optional, Tuple
from django.conf import settings
from django.db import connection
from django.core.cache import cache
from django.utils import timezone
from celery import current_app as celery_app
logger = logging.getLogger(__name__)
class BaseHealthCheck:
"""Base class for health checks"""
def __init__(self, target):
self.target = target
self.start_time = None
self.end_time = None
def execute(self) -> Dict[str, Any]:
"""Execute the health check and return results"""
self.start_time = time.time()
try:
result = self._perform_check()
self.end_time = time.time()
result.update({
'response_time_ms': int((self.end_time - self.start_time) * 1000),
'checked_at': timezone.now(),
'error_message': None
})
return result
except Exception as e:
self.end_time = time.time()
logger.error(f"Health check failed for {self.target.name}: {e}")
return {
'status': 'CRITICAL',
'response_time_ms': int((self.end_time - self.start_time) * 1000),
'checked_at': timezone.now(),
'error_message': str(e)
}
def _perform_check(self) -> Dict[str, Any]:
"""Override in subclasses to implement specific checks"""
raise NotImplementedError
class HTTPHealthCheck(BaseHealthCheck):
"""HTTP-based health check"""
def _perform_check(self) -> Dict[str, Any]:
url = self.target.endpoint_url
if not url:
raise ValueError("No endpoint URL configured")
timeout = self.target.timeout_seconds
expected_codes = self.target.expected_status_codes or [200]
response = requests.get(url, timeout=timeout)
if response.status_code in expected_codes:
status = 'HEALTHY'
elif response.status_code >= 500:
status = 'CRITICAL'
else:
status = 'WARNING'
return {
'status': status,
'status_code': response.status_code,
'response_body': response.text[:1000] # Limit response body size
}
class DatabaseHealthCheck(BaseHealthCheck):
"""Database connection health check"""
def _perform_check(self) -> Dict[str, Any]:
try:
with connection.cursor() as cursor:
cursor.execute("SELECT 1")
result = cursor.fetchone()
if result and result[0] == 1:
return {
'status': 'HEALTHY',
'status_code': 200
}
else:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': 'Database query returned unexpected result'
}
except Exception as e:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': f'Database connection failed: {str(e)}'
}
class CacheHealthCheck(BaseHealthCheck):
"""Cache system health check"""
def _perform_check(self) -> Dict[str, Any]:
try:
# Test cache write/read
test_key = f"health_check_{int(time.time())}"
test_value = "health_check_value"
cache.set(test_key, test_value, timeout=10)
retrieved_value = cache.get(test_key)
if retrieved_value == test_value:
cache.delete(test_key) # Clean up
return {
'status': 'HEALTHY',
'status_code': 200
}
else:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': 'Cache read/write test failed'
}
except Exception as e:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': f'Cache operation failed: {str(e)}'
}
class CeleryHealthCheck(BaseHealthCheck):
"""Celery worker health check"""
def _perform_check(self) -> Dict[str, Any]:
try:
# Check if Celery workers are active
inspect = celery_app.control.inspect()
active_workers = inspect.active()
if active_workers:
worker_count = len(active_workers)
return {
'status': 'HEALTHY',
'status_code': 200,
'response_body': f'Active workers: {worker_count}'
}
else:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': 'No active Celery workers found'
}
except Exception as e:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': f'Celery health check failed: {str(e)}'
}
class SystemResourceHealthCheck(BaseHealthCheck):
"""System resource health check"""
def _perform_check(self) -> Dict[str, Any]:
try:
# Get system metrics
cpu_percent = psutil.cpu_percent(interval=1)
memory = psutil.virtual_memory()
disk = psutil.disk_usage('/')
# Determine status based on thresholds
status = 'HEALTHY'
if cpu_percent > 90 or memory.percent > 90 or disk.percent > 90:
status = 'CRITICAL'
elif cpu_percent > 80 or memory.percent > 80 or disk.percent > 80:
status = 'WARNING'
return {
'status': status,
'status_code': 200,
'cpu_usage_percent': cpu_percent,
'memory_usage_percent': memory.percent,
'disk_usage_percent': disk.percent,
'response_body': f'CPU: {cpu_percent}%, Memory: {memory.percent}%, Disk: {disk.percent}%'
}
except Exception as e:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': f'System resource check failed: {str(e)}'
}
class ModuleHealthCheck(BaseHealthCheck):
"""Django module health check"""
def _perform_check(self) -> Dict[str, Any]:
try:
module_name = self.target.related_module
if not module_name:
raise ValueError("No module specified for module health check")
# Import the module to check if it's accessible
__import__(module_name)
            # Check that the module is registered as a Django app;
            # get_app_config raises LookupError when it is not.
            from django.apps import apps
            try:
                apps.get_app_config(module_name)
            except LookupError:
                return {
                    'status': 'WARNING',
                    'status_code': 200,
                    'error_message': f'Module {module_name} not found in Django apps'
                }
            return {
                'status': 'HEALTHY',
                'status_code': 200,
                'response_body': f'Module {module_name} is accessible'
            }
except Exception as e:
return {
'status': 'CRITICAL',
'status_code': 500,
'error_message': f'Module health check failed: {str(e)}'
}
class HealthCheckFactory:
"""Factory for creating health check instances"""
CHECK_CLASSES = {
'HTTP': HTTPHealthCheck,
'DATABASE': DatabaseHealthCheck,
'CACHE': CacheHealthCheck,
'QUEUE': CeleryHealthCheck,
'CUSTOM': BaseHealthCheck,
'PING': HTTPHealthCheck, # Use HTTP for ping
'SSL': HTTPHealthCheck, # Use HTTP for SSL
}
@classmethod
def create_health_check(cls, target, check_type: str) -> BaseHealthCheck:
"""Create a health check instance based on type"""
check_class = cls.CHECK_CLASSES.get(check_type, BaseHealthCheck)
return check_class(target)
@classmethod
def get_available_check_types(cls) -> list:
"""Get list of available health check types"""
return list(cls.CHECK_CLASSES.keys())
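# --- Example extension (illustrative sketch) ----------------------------------
# A custom check can be added by subclassing BaseHealthCheck and registering
# it with the factory. The 'STORAGE' type name and the temp-file probe below
# are assumptions used only to demonstrate the extension point.
class DiskWritableHealthCheck(BaseHealthCheck):
    """Verify that the application can write to local temporary storage."""

    def _perform_check(self) -> Dict[str, Any]:
        import tempfile
        # Creating (and auto-deleting) a temp file proves the volume is writable
        with tempfile.NamedTemporaryFile(dir=tempfile.gettempdir()):
            pass
        return {'status': 'HEALTHY', 'status_code': 200}


# HealthCheckFactory.CHECK_CLASSES['STORAGE'] = DiskWritableHealthCheck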
class HealthCheckService:
"""Service for managing health checks"""
def __init__(self):
self.factory = HealthCheckFactory()
def execute_health_check(self, target, check_type: str) -> Dict[str, Any]:
"""Execute a health check for a target"""
health_check = self.factory.create_health_check(target, check_type)
return health_check.execute()
def execute_all_health_checks(self) -> Dict[str, Any]:
"""Execute health checks for all active targets"""
from monitoring.models import MonitoringTarget, HealthCheck
results = {}
active_targets = MonitoringTarget.objects.filter(
status='ACTIVE',
health_check_enabled=True
)
for target in active_targets:
try:
# Determine check type based on target type
check_type = self._get_check_type_for_target(target)
# Execute health check
result = self.execute_health_check(target, check_type)
# Save result to database
HealthCheck.objects.create(
target=target,
check_type=check_type,
status=result['status'],
response_time_ms=result.get('response_time_ms'),
status_code=result.get('status_code'),
response_body=result.get('response_body'),
error_message=result.get('error_message'),
cpu_usage_percent=result.get('cpu_usage_percent'),
memory_usage_percent=result.get('memory_usage_percent'),
disk_usage_percent=result.get('disk_usage_percent')
)
# Update target status
target.last_checked = timezone.now()
target.last_status = result['status']
target.save(update_fields=['last_checked', 'last_status'])
results[target.name] = result
except Exception as e:
logger.error(f"Failed to execute health check for {target.name}: {e}")
results[target.name] = {
'status': 'CRITICAL',
'error_message': str(e)
}
return results
def _get_check_type_for_target(self, target) -> str:
"""Determine the appropriate check type for a target"""
target_type_mapping = {
'APPLICATION': 'HTTP',
'DATABASE': 'DATABASE',
'CACHE': 'CACHE',
'QUEUE': 'QUEUE',
'EXTERNAL_API': 'HTTP',
'SERVICE': 'HTTP',
'INFRASTRUCTURE': 'HTTP',
'MODULE': 'CUSTOM',
}
return target_type_mapping.get(target.target_type, 'HTTP')
def get_system_health_summary(self) -> Dict[str, Any]:
"""Get overall system health summary"""
        from monitoring.models import MonitoringTarget
        # Summarise using each target's last recorded status, which
        # execute_all_health_checks keeps in sync with the latest check.
        active_targets = MonitoringTarget.objects.filter(status='ACTIVE')
        total_targets = active_targets.count()
        healthy_targets = active_targets.filter(last_status='HEALTHY').count()
        warning_targets = active_targets.filter(last_status='WARNING').count()
        critical_targets = active_targets.filter(last_status='CRITICAL').count()
# Calculate overall status
if critical_targets > 0:
overall_status = 'CRITICAL'
elif warning_targets > 0:
overall_status = 'WARNING'
elif healthy_targets == total_targets:
overall_status = 'HEALTHY'
else:
overall_status = 'UNKNOWN'
return {
'overall_status': overall_status,
'total_targets': total_targets,
'healthy_targets': healthy_targets,
'warning_targets': warning_targets,
'critical_targets': critical_targets,
'health_percentage': (healthy_targets / total_targets * 100) if total_targets > 0 else 0,
'last_updated': timezone.now()
}
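# --- Example usage (illustrative sketch) --------------------------------------
# A helper like this could be called from a periodic task or a management
# command; the function name and the warning log are assumptions made only
# for illustration.
def run_scheduled_health_checks() -> Dict[str, Any]:
    """Execute all health checks and log a warning if the system is degraded."""
    service = HealthCheckService()
    service.execute_all_health_checks()
    summary = service.get_system_health_summary()
    if summary['overall_status'] in ('WARNING', 'CRITICAL'):
        logger.warning("System health degraded: %s", summary)
    return summary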

@@ -0,0 +1,364 @@
"""
Metrics collection service for system monitoring
"""
import time
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from django.utils import timezone
from django.db import connection
from django.core.cache import cache
from django.conf import settings
from django.contrib.auth import get_user_model
from monitoring.models import SystemMetric, MetricMeasurement
User = get_user_model()
logger = logging.getLogger(__name__)
class MetricsCollector:
"""Service for collecting and storing system metrics"""
def __init__(self):
self.collected_metrics = {}
def collect_all_metrics(self) -> Dict[str, Any]:
"""Collect all configured metrics"""
results = {}
# Get all active metrics
active_metrics = SystemMetric.objects.filter(is_active=True)
for metric in active_metrics:
try:
value = self._collect_metric_value(metric)
if value is not None:
# Store measurement
measurement = MetricMeasurement.objects.create(
metric=metric,
value=value,
tags=self._get_metric_tags(metric),
metadata=self._get_metric_metadata(metric)
)
results[metric.name] = {
'value': value,
'measurement_id': measurement.id,
'timestamp': measurement.timestamp
}
except Exception as e:
logger.error(f"Failed to collect metric {metric.name}: {e}")
results[metric.name] = {
'error': str(e)
}
return results
def _collect_metric_value(self, metric: SystemMetric) -> Optional[float]:
"""Collect value for a specific metric"""
category = metric.category
if category == 'API_RESPONSE_TIME':
return self._collect_api_response_time(metric)
elif category == 'THROUGHPUT':
return self._collect_throughput(metric)
elif category == 'ERROR_RATE':
return self._collect_error_rate(metric)
elif category == 'AVAILABILITY':
return self._collect_availability(metric)
elif category == 'INCIDENT_COUNT':
return self._collect_incident_count(metric)
elif category == 'MTTR':
return self._collect_mttr(metric)
elif category == 'MTTA':
return self._collect_mtta(metric)
elif category == 'SLA_COMPLIANCE':
return self._collect_sla_compliance(metric)
elif category == 'SECURITY_EVENTS':
return self._collect_security_events(metric)
elif category == 'AUTOMATION_SUCCESS':
return self._collect_automation_success(metric)
elif category == 'AI_ACCURACY':
return self._collect_ai_accuracy(metric)
elif category == 'COST_IMPACT':
return self._collect_cost_impact(metric)
elif category == 'USER_ACTIVITY':
return self._collect_user_activity(metric)
elif category == 'SYSTEM_RESOURCES':
return self._collect_system_resources(metric)
else:
logger.warning(f"Unknown metric category: {category}")
return None
def _collect_api_response_time(self, metric: SystemMetric) -> Optional[float]:
"""Collect API response time metrics"""
# This would typically come from middleware or APM tools
# For now, return a mock value
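        # A minimal sketch of a real source, assuming timing middleware keeps a
        # rolling average under a known cache key (the key name is an assumption):
        #
        #   avg_ms = cache.get('monitoring:avg_response_time_ms')
        #   return float(avg_ms) if avg_ms is not None else None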
return 150.5 # milliseconds
def _collect_throughput(self, metric: SystemMetric) -> Optional[float]:
"""Collect throughput metrics (requests per minute)"""
# Count requests in the last minute
# This would typically come from access logs or middleware
return 120.0 # requests per minute
def _collect_error_rate(self, metric: SystemMetric) -> Optional[float]:
"""Collect error rate metrics"""
# Count errors in the last hour
# This would typically come from logs or error tracking
return 0.02 # 2% error rate
def _collect_availability(self, metric: SystemMetric) -> Optional[float]:
"""Collect availability metrics"""
# Calculate availability percentage
# This would typically come from uptime monitoring
return 99.9 # 99.9% availability
def _collect_incident_count(self, metric: SystemMetric) -> Optional[float]:
"""Collect incident count metrics"""
from incident_intelligence.models import Incident
# Count incidents in the last 24 hours
since = timezone.now() - timedelta(hours=24)
count = Incident.objects.filter(created_at__gte=since).count()
return float(count)
def _collect_mttr(self, metric: SystemMetric) -> Optional[float]:
"""Collect Mean Time to Resolve metrics"""
from incident_intelligence.models import Incident
# Calculate MTTR for resolved incidents in the last 7 days
since = timezone.now() - timedelta(days=7)
resolved_incidents = Incident.objects.filter(
status__in=['RESOLVED', 'CLOSED'],
resolved_at__isnull=False,
resolved_at__gte=since
)
if not resolved_incidents.exists():
return None
total_resolution_time = 0
count = 0
for incident in resolved_incidents:
if incident.resolved_at and incident.created_at:
resolution_time = incident.resolved_at - incident.created_at
total_resolution_time += resolution_time.total_seconds()
count += 1
if count > 0:
return total_resolution_time / count / 60 # Convert to minutes
return None
def _collect_mtta(self, metric: SystemMetric) -> Optional[float]:
"""Collect Mean Time to Acknowledge metrics"""
# This would require tracking when incidents are first acknowledged
# For now, return a mock value
return 15.5 # minutes
def _collect_sla_compliance(self, metric: SystemMetric) -> Optional[float]:
"""Collect SLA compliance metrics"""
from sla_oncall.models import SLAInstance
# Calculate SLA compliance percentage
total_slas = SLAInstance.objects.count()
if total_slas == 0:
return None
# This would require more complex SLA compliance calculation
# For now, return a mock value
return 95.5 # 95.5% SLA compliance
def _collect_security_events(self, metric: SystemMetric) -> Optional[float]:
"""Collect security events metrics"""
# Count security events in the last hour
# This would come from security logs or audit trails
return 3.0 # 3 security events in the last hour
def _collect_automation_success(self, metric: SystemMetric) -> Optional[float]:
"""Collect automation success rate metrics"""
from automation_orchestration.models import RunbookExecution
# Calculate success rate for runbook executions in the last 24 hours
since = timezone.now() - timedelta(hours=24)
executions = RunbookExecution.objects.filter(created_at__gte=since)
if not executions.exists():
return None
successful = executions.filter(status='COMPLETED').count()
total = executions.count()
return (successful / total * 100) if total > 0 else None
def _collect_ai_accuracy(self, metric: SystemMetric) -> Optional[float]:
"""Collect AI model accuracy metrics"""
from incident_intelligence.models import IncidentClassification
# Calculate accuracy for AI classifications
classifications = IncidentClassification.objects.all()
if not classifications.exists():
return None
# This would require comparing predictions with actual outcomes
# For now, return average confidence score
total_confidence = sum(c.confidence_score for c in classifications)
return (total_confidence / classifications.count() * 100) if classifications.count() > 0 else None
def _collect_cost_impact(self, metric: SystemMetric) -> Optional[float]:
"""Collect cost impact metrics"""
from analytics_predictive_insights.models import CostImpactAnalysis
# Calculate total cost impact for the last 30 days
since = timezone.now() - timedelta(days=30)
cost_analyses = CostImpactAnalysis.objects.filter(created_at__gte=since)
total_cost = sum(float(ca.cost_amount) for ca in cost_analyses)
return total_cost
def _collect_user_activity(self, metric: SystemMetric) -> Optional[float]:
"""Collect user activity metrics"""
# Count active users in the last hour
since = timezone.now() - timedelta(hours=1)
# This would require user activity tracking
return 25.0 # 25 active users in the last hour
def _collect_system_resources(self, metric: SystemMetric) -> Optional[float]:
"""Collect system resource metrics"""
import psutil
# Get CPU usage
cpu_percent = psutil.cpu_percent(interval=1)
return cpu_percent
def _get_metric_tags(self, metric: SystemMetric) -> Dict[str, str]:
"""Get tags for a metric measurement"""
tags = {
'metric_type': metric.metric_type,
'category': metric.category,
}
if metric.related_module:
tags['module'] = metric.related_module
return tags
def _get_metric_metadata(self, metric: SystemMetric) -> Dict[str, Any]:
"""Get metadata for a metric measurement"""
return {
'unit': metric.unit,
'aggregation_method': metric.aggregation_method,
'collection_interval': metric.collection_interval_seconds,
}
class MetricsAggregator:
"""Service for aggregating metrics over time periods"""
def __init__(self):
self.collector = MetricsCollector()
def aggregate_metrics(self, metric: SystemMetric, start_time: datetime, end_time: datetime) -> Dict[str, Any]:
"""Aggregate metrics over a time period"""
measurements = MetricMeasurement.objects.filter(
metric=metric,
timestamp__gte=start_time,
timestamp__lte=end_time
).order_by('timestamp')
if not measurements.exists():
return {
'count': 0,
'values': [],
'aggregated_value': None
}
values = [float(m.value) for m in measurements]
aggregated_value = self._aggregate_values(values, metric.aggregation_method)
return {
'count': len(values),
'values': values,
'aggregated_value': aggregated_value,
'start_time': start_time,
'end_time': end_time,
'unit': metric.unit
}
def _aggregate_values(self, values: List[float], method: str) -> Optional[float]:
"""Aggregate a list of values using the specified method"""
if not values:
return None
if method == 'AVERAGE':
return sum(values) / len(values)
elif method == 'SUM':
return sum(values)
elif method == 'COUNT':
return len(values)
elif method == 'MIN':
return min(values)
elif method == 'MAX':
return max(values)
elif method == 'PERCENTILE_95':
return self._calculate_percentile(values, 95)
elif method == 'PERCENTILE_99':
return self._calculate_percentile(values, 99)
else:
return sum(values) / len(values) # Default to average
def _calculate_percentile(self, values: List[float], percentile: int) -> float:
"""Calculate percentile of values"""
sorted_values = sorted(values)
index = int((percentile / 100) * len(sorted_values))
return sorted_values[min(index, len(sorted_values) - 1)]
def get_metric_trends(self, metric: SystemMetric, days: int = 7) -> Dict[str, Any]:
"""Get metric trends over a period"""
end_time = timezone.now()
start_time = end_time - timedelta(days=days)
# Get daily aggregations
daily_data = []
for i in range(days):
day_start = start_time + timedelta(days=i)
day_end = day_start + timedelta(days=1)
day_aggregation = self.aggregate_metrics(metric, day_start, day_end)
daily_data.append({
'date': day_start.date(),
'value': day_aggregation['aggregated_value'],
'count': day_aggregation['count']
})
return {
'metric_name': metric.name,
'period_days': days,
'daily_data': daily_data,
'trend': self._calculate_trend([d['value'] for d in daily_data if d['value'] is not None])
}
def _calculate_trend(self, values: List[float]) -> str:
"""Calculate trend direction from values"""
if len(values) < 2:
return 'STABLE'
# Simple linear trend calculation
first_half = values[:len(values)//2]
second_half = values[len(values)//2:]
first_avg = sum(first_half) / len(first_half)
second_avg = sum(second_half) / len(second_half)
change_percent = ((second_avg - first_avg) / first_avg) * 100 if first_avg != 0 else 0
if change_percent > 5:
return 'INCREASING'
elif change_percent < -5:
return 'DECREASING'
else:
return 'STABLE'
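# --- Example usage (illustrative sketch) --------------------------------------
# Typical flow: collect current values for every active metric, then inspect a
# week-long trend for one of them. The metric name 'api_error_rate' is an
# assumption; any SystemMetric defined in the database would do.
def collect_and_report_trends() -> Dict[str, Any]:
    """Collect all metrics once and return the 7-day trend for a sample metric."""
    collector = MetricsCollector()
    collector.collect_all_metrics()

    metric = SystemMetric.objects.filter(name='api_error_rate', is_active=True).first()
    if metric is None:
        return {'error': 'metric not configured'}
    return MetricsAggregator().get_metric_trends(metric, days=7)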