Updates
1
ETB-API/monitoring/services/__init__.py
Normal file
@@ -0,0 +1 @@
# Monitoring services
BIN
ETB-API/monitoring/services/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
ETB-API/monitoring/services/__pycache__/alerting.cpython-312.pyc
Normal file
Binary file not shown.
449
ETB-API/monitoring/services/alerting.py
Normal file
@@ -0,0 +1,449 @@
"""
Alerting service for monitoring system
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import timedelta
from django.utils import timezone
from django.core.mail import send_mail
from django.conf import settings
from django.contrib.auth import get_user_model

from monitoring.models import AlertRule, Alert, MetricMeasurement

User = get_user_model()
logger = logging.getLogger(__name__)


class AlertEvaluator:
    """Service for evaluating alert conditions"""

    def __init__(self):
        self.aggregator = None  # Imported lazily to avoid circular imports

    def evaluate_alert_rules(self) -> List[Dict[str, Any]]:
        """Evaluate all active alert rules"""
        triggered_alerts = []

        active_rules = AlertRule.objects.filter(
            status='ACTIVE',
            is_enabled=True
        )

        for rule in active_rules:
            try:
                if self._evaluate_rule(rule):
                    alert_data = self._create_alert(rule)
                    triggered_alerts.append(alert_data)
            except Exception as e:
                logger.error(f"Failed to evaluate alert rule {rule.name}: {e}")

        return triggered_alerts

    def _evaluate_rule(self, rule: AlertRule) -> bool:
        """Evaluate whether an alert rule's condition is met"""
        condition = rule.condition
        condition_type = condition.get('type')

        if condition_type == 'THRESHOLD':
            return self._evaluate_threshold_condition(rule, condition)
        elif condition_type == 'ANOMALY':
            return self._evaluate_anomaly_condition(rule, condition)
        elif condition_type == 'AVAILABILITY':
            return self._evaluate_availability_condition(rule, condition)
        elif condition_type == 'PATTERN':
            return self._evaluate_pattern_condition(rule, condition)
        else:
            logger.warning(f"Unknown condition type: {condition_type}")
            return False

    def _evaluate_threshold_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate threshold-based alert conditions"""
        if not rule.metric:
            return False

        # Get the latest metric value
        latest_measurement = MetricMeasurement.objects.filter(
            metric=rule.metric
        ).order_by('-timestamp').first()

        if not latest_measurement:
            return False

        current_value = float(latest_measurement.value)
        threshold_value = condition.get('threshold')
        operator = condition.get('operator', '>')

        if threshold_value is None:
            # Comparing against None would raise a TypeError
            logger.warning(f"No threshold configured for rule {rule.name}")
            return False

        if operator == '>':
            return current_value > threshold_value
        elif operator == '>=':
            return current_value >= threshold_value
        elif operator == '<':
            return current_value < threshold_value
        elif operator == '<=':
            return current_value <= threshold_value
        elif operator == '==':
            return current_value == threshold_value
        elif operator == '!=':
            return current_value != threshold_value
        else:
            logger.warning(f"Unknown operator: {operator}")
            return False

    def _evaluate_anomaly_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate anomaly-based alert conditions"""
        # This would integrate with anomaly detection models;
        # for now, implement simple statistical anomaly detection.

        if not rule.metric:
            return False

        # Get recent measurements
        since = timezone.now() - timedelta(hours=24)
        measurements = MetricMeasurement.objects.filter(
            metric=rule.metric,
            timestamp__gte=since
        ).order_by('-timestamp')[:100]  # Last 100 measurements

        if len(measurements) < 10:  # Need a minimum number of data points
            return False

        values = [float(m.value) for m in measurements]

        # Calculate mean and standard deviation
        mean = sum(values) / len(values)
        variance = sum((x - mean) ** 2 for x in values) / len(values)
        std_dev = variance ** 0.5

        # Flag the latest value as an anomaly if it deviates from the mean
        # by more than the configured number of standard deviations
        latest_value = values[0]
        anomaly_threshold = condition.get('threshold', 2.0)  # Default: 2 sigma

        return abs(latest_value - mean) > (anomaly_threshold * std_dev)

    def _evaluate_availability_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate availability-based alert conditions"""
        if not rule.target:
            return False

        # Trigger when the target is in a critical state
        return rule.target.last_status == 'CRITICAL'

    def _evaluate_pattern_condition(self, rule: AlertRule, condition: Dict[str, Any]) -> bool:
        """Evaluate pattern-based alert conditions"""
        # This would integrate with pattern detection algorithms;
        # for now, return False as a placeholder.
        return False

    def _create_alert(self, rule: AlertRule) -> Dict[str, Any]:
        """Create an alert instance"""
        # Get the current value for context
        current_value = None
        threshold_value = None

        if rule.metric:
            latest_measurement = MetricMeasurement.objects.filter(
                metric=rule.metric
            ).order_by('-timestamp').first()
            if latest_measurement:
                current_value = float(latest_measurement.value)
                threshold_value = rule.metric.critical_threshold

        # Create the alert
        alert = Alert.objects.create(
            rule=rule,
            title=f"{rule.name} - {rule.severity}",
            description=self._generate_alert_description(rule, current_value, threshold_value),
            severity=rule.severity,
            triggered_value=current_value,
            threshold_value=threshold_value,
            context_data={
                'rule_id': str(rule.id),
                'metric_name': rule.metric.name if rule.metric else None,
                'target_name': rule.target.name if rule.target else None,
                'condition': rule.condition
            }
        )

        return {
            'alert_id': str(alert.id),
            'rule_id': str(rule.id),  # NotificationService.send_alert_notifications looks this up
            'rule_name': rule.name,
            'severity': rule.severity,
            'title': alert.title,
            'description': alert.description,
            'current_value': current_value,
            'threshold_value': threshold_value
        }

    def _generate_alert_description(self, rule: AlertRule, current_value: Optional[float], threshold_value: Optional[float]) -> str:
        """Generate the alert description"""
        description = f"Alert rule '{rule.name}' has been triggered.\n"

        if rule.metric and current_value is not None:
            description += f"Current value: {current_value} {rule.metric.unit}\n"

        if threshold_value is not None:
            description += f"Threshold: {threshold_value} {rule.metric.unit if rule.metric else ''}\n"

        if rule.target:
            description += f"Target: {rule.target.name}\n"

        description += f"Severity: {rule.severity}\n"
        description += f"Time: {timezone.now().strftime('%Y-%m-%d %H:%M:%S')}"

        return description


class NotificationService:
    """Service for sending alert notifications"""

    def __init__(self):
        self.evaluator = AlertEvaluator()

    def send_alert_notifications(self, alert_data: Dict[str, Any]) -> Dict[str, Any]:
        """Send notifications for an alert"""
        results = {}

        # Get the alert rule to determine notification channels
        rule_id = alert_data.get('rule_id')
        if not rule_id:
            return {'error': 'No rule ID provided'}

        try:
            rule = AlertRule.objects.get(id=rule_id)
        except AlertRule.DoesNotExist:
            return {'error': 'Alert rule not found'}

        notification_channels = rule.notification_channels or []

        for channel in notification_channels:
            try:
                if channel['type'] == 'EMAIL':
                    result = self._send_email_notification(alert_data, channel)
                elif channel['type'] == 'SLACK':
                    result = self._send_slack_notification(alert_data, channel)
                elif channel['type'] == 'WEBHOOK':
                    result = self._send_webhook_notification(alert_data, channel)
                else:
                    result = {'error': f'Unknown notification channel type: {channel["type"]}'}

                results[channel['type']] = result

            except Exception as e:
                logger.error(f"Failed to send {channel['type']} notification: {e}")
                results[channel['type']] = {'error': str(e)}

        return results

    def _send_email_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send an email notification"""
        try:
            recipients = channel.get('recipients', [])
            if not recipients:
                return {'error': 'No email recipients configured'}

            subject = f"[{alert_data.get('severity', 'ALERT')}] {alert_data.get('title', 'System Alert')}"
            message = alert_data.get('description', '')

            send_mail(
                subject=subject,
                message=message,
                from_email=settings.DEFAULT_FROM_EMAIL,
                recipient_list=recipients,
                fail_silently=False
            )

            return {'status': 'sent', 'recipients': recipients}

        except Exception as e:
            return {'error': str(e)}

    def _send_slack_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send a Slack notification"""
        try:
            webhook_url = channel.get('webhook_url')
            if not webhook_url:
                return {'error': 'No Slack webhook URL configured'}

            # Build the Slack message
            color = self._get_slack_color(alert_data.get('severity', 'MEDIUM'))

            slack_message = {
                "text": alert_data.get('title', 'System Alert'),
                "attachments": [
                    {
                        "color": color,
                        "fields": [
                            {
                                "title": "Description",
                                "value": alert_data.get('description', ''),
                                "short": False
                            },
                            {
                                "title": "Severity",
                                "value": alert_data.get('severity', 'UNKNOWN'),
                                "short": True
                            },
                            {
                                "title": "Time",
                                "value": timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
                                "short": True
                            }
                        ]
                    }
                ]
            }

            # Send to Slack (would use requests in a real implementation)
            # requests.post(webhook_url, json=slack_message)

            return {'status': 'sent', 'channel': channel.get('channel', '#alerts')}

        except Exception as e:
            return {'error': str(e)}

    def _send_webhook_notification(self, alert_data: Dict[str, Any], channel: Dict[str, Any]) -> Dict[str, Any]:
        """Send a webhook notification"""
        try:
            webhook_url = channel.get('url')
            if not webhook_url:
                return {'error': 'No webhook URL configured'}

            # Prepare the webhook payload
            payload = {
                'alert': alert_data,
                'timestamp': timezone.now().isoformat(),
                'source': 'ETB-API-Monitoring'
            }

            # Send the webhook (would use requests in a real implementation)
            # requests.post(webhook_url, json=payload)

            return {'status': 'sent', 'url': webhook_url}

        except Exception as e:
            return {'error': str(e)}

    def _get_slack_color(self, severity: str) -> str:
        """Get the Slack attachment color for a severity level"""
        color_map = {
            'LOW': 'good',
            'MEDIUM': 'warning',
            'HIGH': 'danger',
            'CRITICAL': 'danger'
        }
        return color_map.get(severity, 'warning')


class AlertingService:
    """Main alerting service that coordinates alert evaluation and notification"""

    def __init__(self):
        self.evaluator = AlertEvaluator()
        self.notification_service = NotificationService()

    def run_alert_evaluation(self) -> Dict[str, Any]:
        """Run alert evaluation and send notifications"""
        results = {
            'evaluated_rules': 0,
            'triggered_alerts': 0,
            'notifications_sent': 0,
            'errors': []
        }

        try:
            # Evaluate all alert rules
            triggered_alerts = self.evaluator.evaluate_alert_rules()
            results['triggered_alerts'] = len(triggered_alerts)

            # Send notifications for triggered alerts
            for alert_data in triggered_alerts:
                try:
                    self.notification_service.send_alert_notifications(alert_data)
                    results['notifications_sent'] += 1
                except Exception as e:
                    logger.error(f"Failed to send notifications for alert {alert_data.get('alert_id')}: {e}")
                    results['errors'].append(str(e))

            # Count the evaluated rules
            results['evaluated_rules'] = AlertRule.objects.filter(
                status='ACTIVE',
                is_enabled=True
            ).count()

        except Exception as e:
            logger.error(f"Alert evaluation failed: {e}")
            results['errors'].append(str(e))

        return results

    def acknowledge_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
        """Acknowledge an alert"""
        try:
            alert = Alert.objects.get(id=alert_id)
            alert.status = 'ACKNOWLEDGED'
            alert.acknowledged_by = user
            alert.acknowledged_at = timezone.now()
            alert.save()

            return {
                'status': 'success',
                'message': f'Alert {alert_id} acknowledged by {user.username}'
            }

        except Alert.DoesNotExist:
            return {
                'status': 'error',
                'message': f'Alert {alert_id} not found'
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': str(e)
            }

    def resolve_alert(self, alert_id: str, user: User) -> Dict[str, Any]:
        """Resolve an alert"""
        try:
            alert = Alert.objects.get(id=alert_id)
            alert.status = 'RESOLVED'
            alert.resolved_by = user
            alert.resolved_at = timezone.now()
            alert.save()

            return {
                'status': 'success',
                'message': f'Alert {alert_id} resolved by {user.username}'
            }

        except Alert.DoesNotExist:
            return {
                'status': 'error',
                'message': f'Alert {alert_id} not found'
            }
        except Exception as e:
            return {
                'status': 'error',
                'message': str(e)
            }

    def get_active_alerts(self, severity: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get active alerts"""
        alerts = Alert.objects.filter(status='TRIGGERED')

        if severity:
            alerts = alerts.filter(severity=severity)

        return [
            {
                'id': str(alert.id),
                'title': alert.title,
                'description': alert.description,
                'severity': alert.severity,
                'triggered_at': alert.triggered_at,
                'rule_name': alert.rule.name,
                'current_value': float(alert.triggered_value) if alert.triggered_value is not None else None,
                'threshold_value': float(alert.threshold_value) if alert.threshold_value is not None else None
            }
            for alert in alerts.order_by('-triggered_at')
        ]
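Both _send_slack_notification and _send_webhook_notification above stop short of actual HTTP delivery and leave requests.post commented out. A minimal sketch of that missing step, assuming the requests package (already imported by health_checks.py below) is acceptable here; the helper name post_json, the 5-second timeout, and the returned dict shape are illustrative choices, not part of this commit:

import requests

def post_json(url: str, payload: dict, timeout: float = 5.0) -> dict:
    # Hypothetical helper: POST the JSON payload and map the outcome onto
    # the same result dicts the channel senders already return.
    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()  # treat 4xx/5xx responses as delivery failures
        return {'status': 'sent', 'http_status': response.status_code}
    except requests.RequestException as e:
        return {'error': str(e)}

With a helper like this, the commented-out calls would become post_json(webhook_url, slack_message) and post_json(webhook_url, payload) respectively.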
372
ETB-API/monitoring/services/health_checks.py
Normal file
@@ -0,0 +1,372 @@
"""
Health check services for monitoring system components
"""
import time
import requests
import psutil
import logging
from typing import Dict, Any
from django.db import connection
from django.core.cache import cache
from django.utils import timezone
from celery import current_app as celery_app

logger = logging.getLogger(__name__)


class BaseHealthCheck:
    """Base class for health checks"""

    def __init__(self, target):
        self.target = target
        self.start_time = None
        self.end_time = None

    def execute(self) -> Dict[str, Any]:
        """Execute the health check and return the results"""
        self.start_time = time.time()
        try:
            result = self._perform_check()
            self.end_time = time.time()

            result['response_time_ms'] = int((self.end_time - self.start_time) * 1000)
            result['checked_at'] = timezone.now()
            # Preserve any error message the check itself reported
            result.setdefault('error_message', None)

            return result
        except Exception as e:
            self.end_time = time.time()
            logger.error(f"Health check failed for {self.target.name}: {e}")
            return {
                'status': 'CRITICAL',
                'response_time_ms': int((self.end_time - self.start_time) * 1000),
                'checked_at': timezone.now(),
                'error_message': str(e)
            }

    def _perform_check(self) -> Dict[str, Any]:
        """Override in subclasses to implement specific checks"""
        raise NotImplementedError


class HTTPHealthCheck(BaseHealthCheck):
    """HTTP-based health check"""

    def _perform_check(self) -> Dict[str, Any]:
        url = self.target.endpoint_url
        if not url:
            raise ValueError("No endpoint URL configured")

        timeout = self.target.timeout_seconds
        expected_codes = self.target.expected_status_codes or [200]

        response = requests.get(url, timeout=timeout)

        if response.status_code in expected_codes:
            status = 'HEALTHY'
        elif response.status_code >= 500:
            status = 'CRITICAL'
        else:
            status = 'WARNING'

        return {
            'status': status,
            'status_code': response.status_code,
            'response_body': response.text[:1000]  # Limit response body size
        }


class DatabaseHealthCheck(BaseHealthCheck):
    """Database connection health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
                result = cursor.fetchone()

            if result and result[0] == 1:
                return {
                    'status': 'HEALTHY',
                    'status_code': 200
                }
            else:
                return {
                    'status': 'CRITICAL',
                    'status_code': 500,
                    'error_message': 'Database query returned unexpected result'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Database connection failed: {str(e)}'
            }


class CacheHealthCheck(BaseHealthCheck):
    """Cache system health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            # Test a cache write/read round trip
            test_key = f"health_check_{int(time.time())}"
            test_value = "health_check_value"

            cache.set(test_key, test_value, timeout=10)
            retrieved_value = cache.get(test_key)

            if retrieved_value == test_value:
                cache.delete(test_key)  # Clean up
                return {
                    'status': 'HEALTHY',
                    'status_code': 200
                }
            else:
                return {
                    'status': 'CRITICAL',
                    'status_code': 500,
                    'error_message': 'Cache read/write test failed'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Cache operation failed: {str(e)}'
            }


class CeleryHealthCheck(BaseHealthCheck):
    """Celery worker health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            # Check whether any Celery workers are active
            inspect = celery_app.control.inspect()
            active_workers = inspect.active()

            if active_workers:
                worker_count = len(active_workers)
                return {
                    'status': 'HEALTHY',
                    'status_code': 200,
                    'response_body': f'Active workers: {worker_count}'
                }
            else:
                return {
                    'status': 'CRITICAL',
                    'status_code': 500,
                    'error_message': 'No active Celery workers found'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Celery health check failed: {str(e)}'
            }


class SystemResourceHealthCheck(BaseHealthCheck):
    """System resource health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            # Get system metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            disk = psutil.disk_usage('/')

            # Determine status based on thresholds
            status = 'HEALTHY'
            if cpu_percent > 90 or memory.percent > 90 or disk.percent > 90:
                status = 'CRITICAL'
            elif cpu_percent > 80 or memory.percent > 80 or disk.percent > 80:
                status = 'WARNING'

            return {
                'status': status,
                'status_code': 200,
                'cpu_usage_percent': cpu_percent,
                'memory_usage_percent': memory.percent,
                'disk_usage_percent': disk.percent,
                'response_body': f'CPU: {cpu_percent}%, Memory: {memory.percent}%, Disk: {disk.percent}%'
            }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'System resource check failed: {str(e)}'
            }


class ModuleHealthCheck(BaseHealthCheck):
    """Django module health check"""

    def _perform_check(self) -> Dict[str, Any]:
        try:
            module_name = self.target.related_module
            if not module_name:
                raise ValueError("No module specified for module health check")

            # Import the module to check that it is accessible
            __import__(module_name)

            # Check that the module is registered as a Django app
            from django.apps import apps
            app_config = apps.get_app_config(module_name)

            if app_config:
                return {
                    'status': 'HEALTHY',
                    'status_code': 200,
                    'response_body': f'Module {module_name} is accessible'
                }
            else:
                return {
                    'status': 'WARNING',
                    'status_code': 200,
                    'error_message': f'Module {module_name} not found in Django apps'
                }
        except Exception as e:
            return {
                'status': 'CRITICAL',
                'status_code': 500,
                'error_message': f'Module health check failed: {str(e)}'
            }


class HealthCheckFactory:
    """Factory for creating health check instances"""

    CHECK_CLASSES = {
        'HTTP': HTTPHealthCheck,
        'DATABASE': DatabaseHealthCheck,
        'CACHE': CacheHealthCheck,
        'QUEUE': CeleryHealthCheck,
        'CUSTOM': ModuleHealthCheck,  # BaseHealthCheck is abstract; MODULE targets map to CUSTOM
        'PING': HTTPHealthCheck,  # Use HTTP for ping
        'SSL': HTTPHealthCheck,  # Use HTTP for SSL
    }

    @classmethod
    def create_health_check(cls, target, check_type: str) -> BaseHealthCheck:
        """Create a health check instance based on type"""
        check_class = cls.CHECK_CLASSES.get(check_type, BaseHealthCheck)
        return check_class(target)

    @classmethod
    def get_available_check_types(cls) -> list:
        """Get the list of available health check types"""
        return list(cls.CHECK_CLASSES.keys())


class HealthCheckService:
    """Service for managing health checks"""

    def __init__(self):
        self.factory = HealthCheckFactory()

    def execute_health_check(self, target, check_type: str) -> Dict[str, Any]:
        """Execute a health check for a target"""
        health_check = self.factory.create_health_check(target, check_type)
        return health_check.execute()

    def execute_all_health_checks(self) -> Dict[str, Any]:
        """Execute health checks for all active targets"""
        from monitoring.models import MonitoringTarget, HealthCheck

        results = {}
        active_targets = MonitoringTarget.objects.filter(
            status='ACTIVE',
            health_check_enabled=True
        )

        for target in active_targets:
            try:
                # Determine the check type based on the target type
                check_type = self._get_check_type_for_target(target)

                # Execute the health check
                result = self.execute_health_check(target, check_type)

                # Save the result to the database
                HealthCheck.objects.create(
                    target=target,
                    check_type=check_type,
                    status=result['status'],
                    response_time_ms=result.get('response_time_ms'),
                    status_code=result.get('status_code'),
                    response_body=result.get('response_body'),
                    error_message=result.get('error_message'),
                    cpu_usage_percent=result.get('cpu_usage_percent'),
                    memory_usage_percent=result.get('memory_usage_percent'),
                    disk_usage_percent=result.get('disk_usage_percent')
                )

                # Update the target status
                target.last_checked = timezone.now()
                target.last_status = result['status']
                target.save(update_fields=['last_checked', 'last_status'])

                results[target.name] = result

            except Exception as e:
                logger.error(f"Failed to execute health check for {target.name}: {e}")
                results[target.name] = {
                    'status': 'CRITICAL',
                    'error_message': str(e)
                }

        return results

    def _get_check_type_for_target(self, target) -> str:
        """Determine the appropriate check type for a target"""
        target_type_mapping = {
            'APPLICATION': 'HTTP',
            'DATABASE': 'DATABASE',
            'CACHE': 'CACHE',
            'QUEUE': 'QUEUE',
            'EXTERNAL_API': 'HTTP',
            'SERVICE': 'HTTP',
            'INFRASTRUCTURE': 'HTTP',
            'MODULE': 'CUSTOM',
        }

        return target_type_mapping.get(target.target_type, 'HTTP')

    def get_system_health_summary(self) -> Dict[str, Any]:
        """Get an overall system health summary"""
        from monitoring.models import HealthCheck, MonitoringTarget

        # Get the latest health check for each target (PostgreSQL DISTINCT ON)
        latest_checks = HealthCheck.objects.filter(
            target__status='ACTIVE'
        ).order_by('target', '-checked_at').distinct('target')

        total_targets = MonitoringTarget.objects.filter(status='ACTIVE').count()
        healthy_targets = latest_checks.filter(status='HEALTHY').count()
        warning_targets = latest_checks.filter(status='WARNING').count()
        critical_targets = latest_checks.filter(status='CRITICAL').count()

        # Calculate the overall status
        if critical_targets > 0:
            overall_status = 'CRITICAL'
        elif warning_targets > 0:
            overall_status = 'WARNING'
        elif healthy_targets == total_targets:
            overall_status = 'HEALTHY'
        else:
            overall_status = 'UNKNOWN'

        return {
            'overall_status': overall_status,
            'total_targets': total_targets,
            'healthy_targets': healthy_targets,
            'warning_targets': warning_targets,
            'critical_targets': critical_targets,
            'health_percentage': (healthy_targets / total_targets * 100) if total_targets > 0 else 0,
            'last_updated': timezone.now()
        }
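Nothing in this commit shows how execute_all_health_checks gets scheduled. A hedged wiring sketch using a Celery beat task, which fits the celery dependency the file already has; the module path monitoring/tasks.py, the task name, and the 60-second cadence are illustrative assumptions, not part of the commit:

# monitoring/tasks.py (hypothetical wiring, not in this commit)
from celery import shared_task

from monitoring.services.health_checks import HealthCheckService

@shared_task
def run_health_checks():
    # Results are persisted to the HealthCheck model by the service itself,
    # so nothing is returned (the dicts contain non-JSON-serializable datetimes).
    HealthCheckService().execute_all_health_checks()

# Illustrative beat entry, assuming the common CELERY settings namespace:
CELERY_BEAT_SCHEDULE = {
    'run-health-checks': {
        'task': 'monitoring.tasks.run_health_checks',
        'schedule': 60.0,  # seconds between rounds; tune to target sensitivity
    },
}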
364
ETB-API/monitoring/services/metrics_collector.py
Normal file
@@ -0,0 +1,364 @@
"""
Metrics collection service for system monitoring
"""
import logging
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from django.utils import timezone

from monitoring.models import SystemMetric, MetricMeasurement

logger = logging.getLogger(__name__)


class MetricsCollector:
    """Service for collecting and storing system metrics"""

    def __init__(self):
        self.collected_metrics = {}

    def collect_all_metrics(self) -> Dict[str, Any]:
        """Collect all configured metrics"""
        results = {}

        # Get all active metrics
        active_metrics = SystemMetric.objects.filter(is_active=True)

        for metric in active_metrics:
            try:
                value = self._collect_metric_value(metric)
                if value is not None:
                    # Store the measurement
                    measurement = MetricMeasurement.objects.create(
                        metric=metric,
                        value=value,
                        tags=self._get_metric_tags(metric),
                        metadata=self._get_metric_metadata(metric)
                    )

                    results[metric.name] = {
                        'value': value,
                        'measurement_id': measurement.id,
                        'timestamp': measurement.timestamp
                    }

            except Exception as e:
                logger.error(f"Failed to collect metric {metric.name}: {e}")
                results[metric.name] = {
                    'error': str(e)
                }

        return results

    def _collect_metric_value(self, metric: SystemMetric) -> Optional[float]:
        """Collect the value for a specific metric"""
        category = metric.category

        if category == 'API_RESPONSE_TIME':
            return self._collect_api_response_time(metric)
        elif category == 'THROUGHPUT':
            return self._collect_throughput(metric)
        elif category == 'ERROR_RATE':
            return self._collect_error_rate(metric)
        elif category == 'AVAILABILITY':
            return self._collect_availability(metric)
        elif category == 'INCIDENT_COUNT':
            return self._collect_incident_count(metric)
        elif category == 'MTTR':
            return self._collect_mttr(metric)
        elif category == 'MTTA':
            return self._collect_mtta(metric)
        elif category == 'SLA_COMPLIANCE':
            return self._collect_sla_compliance(metric)
        elif category == 'SECURITY_EVENTS':
            return self._collect_security_events(metric)
        elif category == 'AUTOMATION_SUCCESS':
            return self._collect_automation_success(metric)
        elif category == 'AI_ACCURACY':
            return self._collect_ai_accuracy(metric)
        elif category == 'COST_IMPACT':
            return self._collect_cost_impact(metric)
        elif category == 'USER_ACTIVITY':
            return self._collect_user_activity(metric)
        elif category == 'SYSTEM_RESOURCES':
            return self._collect_system_resources(metric)
        else:
            logger.warning(f"Unknown metric category: {category}")
            return None

    def _collect_api_response_time(self, metric: SystemMetric) -> Optional[float]:
        """Collect API response time metrics"""
        # This would typically come from middleware or APM tools;
        # for now, return a mock value.
        return 150.5  # milliseconds

    def _collect_throughput(self, metric: SystemMetric) -> Optional[float]:
        """Collect throughput metrics (requests per minute)"""
        # Count requests in the last minute.
        # This would typically come from access logs or middleware.
        return 120.0  # requests per minute

    def _collect_error_rate(self, metric: SystemMetric) -> Optional[float]:
        """Collect error rate metrics"""
        # Count errors in the last hour.
        # This would typically come from logs or error tracking.
        return 0.02  # 2% error rate

    def _collect_availability(self, metric: SystemMetric) -> Optional[float]:
        """Collect availability metrics"""
        # Calculate the availability percentage.
        # This would typically come from uptime monitoring.
        return 99.9  # 99.9% availability

    def _collect_incident_count(self, metric: SystemMetric) -> Optional[float]:
        """Collect incident count metrics"""
        from incident_intelligence.models import Incident

        # Count incidents in the last 24 hours
        since = timezone.now() - timedelta(hours=24)
        count = Incident.objects.filter(created_at__gte=since).count()
        return float(count)

    def _collect_mttr(self, metric: SystemMetric) -> Optional[float]:
        """Collect Mean Time to Resolve metrics"""
        from incident_intelligence.models import Incident

        # Calculate MTTR for incidents resolved in the last 7 days
        since = timezone.now() - timedelta(days=7)
        resolved_incidents = Incident.objects.filter(
            status__in=['RESOLVED', 'CLOSED'],
            resolved_at__isnull=False,
            resolved_at__gte=since
        )

        if not resolved_incidents.exists():
            return None

        total_resolution_time = 0
        count = 0

        for incident in resolved_incidents:
            if incident.resolved_at and incident.created_at:
                resolution_time = incident.resolved_at - incident.created_at
                total_resolution_time += resolution_time.total_seconds()
                count += 1

        if count > 0:
            return total_resolution_time / count / 60  # Convert to minutes
        return None

    def _collect_mtta(self, metric: SystemMetric) -> Optional[float]:
        """Collect Mean Time to Acknowledge metrics"""
        # This would require tracking when incidents are first acknowledged;
        # for now, return a mock value.
        return 15.5  # minutes

    def _collect_sla_compliance(self, metric: SystemMetric) -> Optional[float]:
        """Collect SLA compliance metrics"""
        from sla_oncall.models import SLAInstance

        # Calculate the SLA compliance percentage
        total_slas = SLAInstance.objects.count()
        if total_slas == 0:
            return None

        # This would require a more detailed SLA compliance calculation;
        # for now, return a mock value.
        return 95.5  # 95.5% SLA compliance

    def _collect_security_events(self, metric: SystemMetric) -> Optional[float]:
        """Collect security event metrics"""
        # Count security events in the last hour.
        # This would come from security logs or audit trails.
        return 3.0  # 3 security events in the last hour

    def _collect_automation_success(self, metric: SystemMetric) -> Optional[float]:
        """Collect automation success rate metrics"""
        from automation_orchestration.models import RunbookExecution

        # Calculate the success rate for runbook executions in the last 24 hours
        since = timezone.now() - timedelta(hours=24)
        executions = RunbookExecution.objects.filter(created_at__gte=since)

        if not executions.exists():
            return None

        successful = executions.filter(status='COMPLETED').count()
        total = executions.count()

        return (successful / total * 100) if total > 0 else None

    def _collect_ai_accuracy(self, metric: SystemMetric) -> Optional[float]:
        """Collect AI model accuracy metrics"""
        from incident_intelligence.models import IncidentClassification

        # Calculate accuracy for AI classifications
        classifications = IncidentClassification.objects.all()

        if not classifications.exists():
            return None

        # This would require comparing predictions with actual outcomes;
        # for now, return the average confidence score.
        total_confidence = sum(c.confidence_score for c in classifications)
        return (total_confidence / classifications.count() * 100) if classifications.count() > 0 else None

    def _collect_cost_impact(self, metric: SystemMetric) -> Optional[float]:
        """Collect cost impact metrics"""
        from analytics_predictive_insights.models import CostImpactAnalysis

        # Calculate the total cost impact for the last 30 days
        since = timezone.now() - timedelta(days=30)
        cost_analyses = CostImpactAnalysis.objects.filter(created_at__gte=since)

        total_cost = sum(float(ca.cost_amount) for ca in cost_analyses)
        return total_cost

    def _collect_user_activity(self, metric: SystemMetric) -> Optional[float]:
        """Collect user activity metrics"""
        # Count active users in the last hour.
        # This would require user activity tracking; for now, return a mock value.
        return 25.0  # 25 active users in the last hour

    def _collect_system_resources(self, metric: SystemMetric) -> Optional[float]:
        """Collect system resource metrics"""
        import psutil

        # Get the CPU usage
        cpu_percent = psutil.cpu_percent(interval=1)
        return cpu_percent

    def _get_metric_tags(self, metric: SystemMetric) -> Dict[str, str]:
        """Get tags for a metric measurement"""
        tags = {
            'metric_type': metric.metric_type,
            'category': metric.category,
        }

        if metric.related_module:
            tags['module'] = metric.related_module

        return tags

    def _get_metric_metadata(self, metric: SystemMetric) -> Dict[str, Any]:
        """Get metadata for a metric measurement"""
        return {
            'unit': metric.unit,
            'aggregation_method': metric.aggregation_method,
            'collection_interval': metric.collection_interval_seconds,
        }


class MetricsAggregator:
    """Service for aggregating metrics over time periods"""

    def __init__(self):
        self.collector = MetricsCollector()

    def aggregate_metrics(self, metric: SystemMetric, start_time: datetime, end_time: datetime) -> Dict[str, Any]:
        """Aggregate metrics over a time period"""
        measurements = MetricMeasurement.objects.filter(
            metric=metric,
            timestamp__gte=start_time,
            timestamp__lte=end_time
        ).order_by('timestamp')

        if not measurements.exists():
            return {
                'count': 0,
                'values': [],
                'aggregated_value': None
            }

        values = [float(m.value) for m in measurements]
        aggregated_value = self._aggregate_values(values, metric.aggregation_method)

        return {
            'count': len(values),
            'values': values,
            'aggregated_value': aggregated_value,
            'start_time': start_time,
            'end_time': end_time,
            'unit': metric.unit
        }

    def _aggregate_values(self, values: List[float], method: str) -> Optional[float]:
        """Aggregate a list of values using the specified method"""
        if not values:
            return None

        if method == 'AVERAGE':
            return sum(values) / len(values)
        elif method == 'SUM':
            return sum(values)
        elif method == 'COUNT':
            return len(values)
        elif method == 'MIN':
            return min(values)
        elif method == 'MAX':
            return max(values)
        elif method == 'PERCENTILE_95':
            return self._calculate_percentile(values, 95)
        elif method == 'PERCENTILE_99':
            return self._calculate_percentile(values, 99)
        else:
            return sum(values) / len(values)  # Default to average

    def _calculate_percentile(self, values: List[float], percentile: int) -> float:
        """Calculate a percentile of the values (nearest-rank approximation)"""
        sorted_values = sorted(values)
        index = int((percentile / 100) * len(sorted_values))
        return sorted_values[min(index, len(sorted_values) - 1)]

    def get_metric_trends(self, metric: SystemMetric, days: int = 7) -> Dict[str, Any]:
        """Get metric trends over a period"""
        end_time = timezone.now()
        start_time = end_time - timedelta(days=days)

        # Get daily aggregations
        daily_data = []
        for i in range(days):
            day_start = start_time + timedelta(days=i)
            day_end = day_start + timedelta(days=1)

            day_aggregation = self.aggregate_metrics(metric, day_start, day_end)
            daily_data.append({
                'date': day_start.date(),
                'value': day_aggregation['aggregated_value'],
                'count': day_aggregation['count']
            })

        return {
            'metric_name': metric.name,
            'period_days': days,
            'daily_data': daily_data,
            'trend': self._calculate_trend([d['value'] for d in daily_data if d['value'] is not None])
        }

    def _calculate_trend(self, values: List[float]) -> str:
        """Calculate the trend direction from a series of values"""
        if len(values) < 2:
            return 'STABLE'

        # Simple linear trend: compare the averages of the two halves
        first_half = values[:len(values) // 2]
        second_half = values[len(values) // 2:]

        first_avg = sum(first_half) / len(first_half)
        second_avg = sum(second_half) / len(second_half)

        change_percent = ((second_avg - first_avg) / first_avg) * 100 if first_avg != 0 else 0

        if change_percent > 5:
            return 'INCREASING'
        elif change_percent < -5:
            return 'DECREASING'
        else:
            return 'STABLE'
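A hedged usage sketch tying the collector and aggregator together: run one collection pass, then aggregate the last 24 hours of a single metric. The metric name 'cpu_usage' is a hypothetical example, and the sketch assumes at least one measurement exists in the window (otherwise aggregate_metrics returns only count/values/aggregated_value):

from datetime import timedelta
from django.utils import timezone

from monitoring.models import SystemMetric
from monitoring.services.metrics_collector import MetricsCollector, MetricsAggregator

MetricsCollector().collect_all_metrics()  # one pass over all active metrics

metric = SystemMetric.objects.get(name='cpu_usage')  # hypothetical metric name
window_end = timezone.now()
window_start = window_end - timedelta(hours=24)

summary = MetricsAggregator().aggregate_metrics(metric, window_start, window_end)
print(summary['count'], summary['aggregated_value'], summary.get('unit'))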