"""
|
|
Metrics collection service for system monitoring
|
|
"""
|
|
import time
|
|
import logging
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime, timedelta
|
|
from django.utils import timezone
|
|
from django.db import connection
|
|
from django.core.cache import cache
|
|
from django.conf import settings
|
|
from django.contrib.auth import get_user_model
|
|
|
|
from monitoring.models import SystemMetric, MetricMeasurement
|
|
|
|
User = get_user_model()
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class MetricsCollector:
    """Service for collecting and storing system metrics.

    For each active ``SystemMetric`` this service collects a value via a
    category-specific collector method and persists it as a
    ``MetricMeasurement``.  Several collectors currently return mock values
    (marked below) pending integration with real data sources.
    """

    def __init__(self):
        # Scratch space available to callers; not used internally.
        # Kept for backward compatibility.
        self.collected_metrics = {}

    def collect_all_metrics(self) -> Dict[str, Any]:
        """Collect every active metric and store a measurement for each.

        Returns:
            Mapping of metric name to either
            ``{'value', 'measurement_id', 'timestamp'}`` on success or
            ``{'error': <message>}`` on failure.
        """
        results = {}

        for metric in SystemMetric.objects.filter(is_active=True):
            try:
                value = self._collect_metric_value(metric)
                if value is not None:
                    measurement = MetricMeasurement.objects.create(
                        metric=metric,
                        value=value,
                        tags=self._get_metric_tags(metric),
                        metadata=self._get_metric_metadata(metric),
                    )
                    results[metric.name] = {
                        'value': value,
                        'measurement_id': measurement.id,
                        'timestamp': measurement.timestamp,
                    }
            except Exception as e:
                # One failing metric must not abort the whole collection run.
                logger.error(f"Failed to collect metric {metric.name}: {e}")
                results[metric.name] = {
                    'error': str(e)
                }

        return results

    def _collect_metric_value(self, metric: SystemMetric) -> Optional[float]:
        """Dispatch to the collector for ``metric.category``.

        Returns:
            The collected value, or None if the category is unknown
            (a warning is logged) or the collector has no data.
        """
        # Dispatch table replaces the previous 14-way if/elif chain.
        collectors = {
            'API_RESPONSE_TIME': self._collect_api_response_time,
            'THROUGHPUT': self._collect_throughput,
            'ERROR_RATE': self._collect_error_rate,
            'AVAILABILITY': self._collect_availability,
            'INCIDENT_COUNT': self._collect_incident_count,
            'MTTR': self._collect_mttr,
            'MTTA': self._collect_mtta,
            'SLA_COMPLIANCE': self._collect_sla_compliance,
            'SECURITY_EVENTS': self._collect_security_events,
            'AUTOMATION_SUCCESS': self._collect_automation_success,
            'AI_ACCURACY': self._collect_ai_accuracy,
            'COST_IMPACT': self._collect_cost_impact,
            'USER_ACTIVITY': self._collect_user_activity,
            'SYSTEM_RESOURCES': self._collect_system_resources,
        }

        category = metric.category
        collector = collectors.get(category)
        if collector is None:
            logger.warning(f"Unknown metric category: {category}")
            return None
        return collector(metric)

    def _collect_api_response_time(self, metric: SystemMetric) -> Optional[float]:
        """Collect API response time metrics (milliseconds)."""
        # This would typically come from middleware or APM tools.
        # For now, return a mock value.
        return 150.5  # milliseconds

    def _collect_throughput(self, metric: SystemMetric) -> Optional[float]:
        """Collect throughput metrics (requests per minute)."""
        # This would typically come from access logs or middleware.
        # For now, return a mock value.
        return 120.0  # requests per minute

    def _collect_error_rate(self, metric: SystemMetric) -> Optional[float]:
        """Collect error rate metrics (fraction of requests)."""
        # This would typically come from logs or error tracking.
        # For now, return a mock value.
        return 0.02  # 2% error rate

    def _collect_availability(self, metric: SystemMetric) -> Optional[float]:
        """Collect availability metrics (percentage)."""
        # This would typically come from uptime monitoring.
        # For now, return a mock value.
        return 99.9  # 99.9% availability

    def _collect_incident_count(self, metric: SystemMetric) -> Optional[float]:
        """Count incidents created in the last 24 hours."""
        from incident_intelligence.models import Incident

        since = timezone.now() - timedelta(hours=24)
        return float(Incident.objects.filter(created_at__gte=since).count())

    def _collect_mttr(self, metric: SystemMetric) -> Optional[float]:
        """Mean Time to Resolve, in minutes, over the last 7 days.

        Returns None when no resolved incidents exist in the window.
        """
        from incident_intelligence.models import Incident

        since = timezone.now() - timedelta(days=7)
        resolved_incidents = Incident.objects.filter(
            status__in=['RESOLVED', 'CLOSED'],
            resolved_at__isnull=False,
            resolved_at__gte=since,
        )

        # Guard against rows with a missing created_at; resolved_at is
        # already guaranteed non-null by the filter above.
        durations = [
            (incident.resolved_at - incident.created_at).total_seconds()
            for incident in resolved_incidents
            if incident.resolved_at and incident.created_at
        ]

        if not durations:
            return None
        return sum(durations) / len(durations) / 60  # seconds -> minutes

    def _collect_mtta(self, metric: SystemMetric) -> Optional[float]:
        """Mean Time to Acknowledge, in minutes."""
        # This would require tracking when incidents are first acknowledged.
        # For now, return a mock value.
        return 15.5  # minutes

    def _collect_sla_compliance(self, metric: SystemMetric) -> Optional[float]:
        """Collect SLA compliance percentage.

        Returns None when no SLA instances exist.
        """
        from sla_oncall.models import SLAInstance

        if SLAInstance.objects.count() == 0:
            return None

        # This would require a real SLA compliance calculation.
        # For now, return a mock value.
        return 95.5  # 95.5% SLA compliance

    def _collect_security_events(self, metric: SystemMetric) -> Optional[float]:
        """Count security events in the last hour."""
        # This would come from security logs or audit trails.
        # For now, return a mock value.
        return 3.0  # 3 security events in the last hour

    def _collect_automation_success(self, metric: SystemMetric) -> Optional[float]:
        """Success rate (%) of runbook executions over the last 24 hours.

        Returns None when there were no executions in the window.
        """
        from automation_orchestration.models import RunbookExecution

        since = timezone.now() - timedelta(hours=24)
        executions = RunbookExecution.objects.filter(created_at__gte=since)

        # Single count() instead of exists() + count() (one query fewer).
        total = executions.count()
        if total == 0:
            return None

        successful = executions.filter(status='COMPLETED').count()
        return successful / total * 100

    def _collect_ai_accuracy(self, metric: SystemMetric) -> Optional[float]:
        """Proxy for AI model accuracy: mean classification confidence (%).

        Returns None when no classifications exist.  A true accuracy metric
        would compare predictions against actual outcomes.
        """
        from incident_intelligence.models import IncidentClassification

        # Materialize once instead of exists() + iterate + count() x2.
        scores = [
            c.confidence_score for c in IncidentClassification.objects.all()
        ]
        if not scores:
            return None
        return sum(scores) / len(scores) * 100

    def _collect_cost_impact(self, metric: SystemMetric) -> Optional[float]:
        """Total cost impact over the last 30 days."""
        from analytics_predictive_insights.models import CostImpactAnalysis

        since = timezone.now() - timedelta(days=30)
        cost_analyses = CostImpactAnalysis.objects.filter(created_at__gte=since)
        return sum(float(ca.cost_amount) for ca in cost_analyses)

    def _collect_user_activity(self, metric: SystemMetric) -> Optional[float]:
        """Count active users in the last hour."""
        # This would require user activity tracking.
        # For now, return a mock value.
        return 25.0  # 25 active users in the last hour

    def _collect_system_resources(self, metric: SystemMetric) -> Optional[float]:
        """Current CPU utilization percentage (sampled over 1 second)."""
        import psutil

        return psutil.cpu_percent(interval=1)

    def _get_metric_tags(self, metric: SystemMetric) -> Dict[str, str]:
        """Build the tag dict stored alongside each measurement."""
        tags = {
            'metric_type': metric.metric_type,
            'category': metric.category,
        }
        if metric.related_module:
            tags['module'] = metric.related_module
        return tags

    def _get_metric_metadata(self, metric: SystemMetric) -> Dict[str, Any]:
        """Build the metadata dict stored alongside each measurement."""
        return {
            'unit': metric.unit,
            'aggregation_method': metric.aggregation_method,
            'collection_interval': metric.collection_interval_seconds,
        }
class MetricsAggregator:
    """Service for aggregating metrics over time periods.

    Wraps stored ``MetricMeasurement`` rows with windowed aggregation
    (average, sum, min/max, percentiles) and simple trend detection.
    """

    def __init__(self):
        self.collector = MetricsCollector()

    def aggregate_metrics(self, metric: SystemMetric, start_time: datetime, end_time: datetime) -> Dict[str, Any]:
        """Aggregate stored measurements for *metric* within the window.

        Returns a dict with ``count``, ``values``, ``aggregated_value`` and,
        when measurements exist, the window bounds and unit.
        """
        queryset = (
            MetricMeasurement.objects
            .filter(metric=metric, timestamp__gte=start_time, timestamp__lte=end_time)
            .order_by('timestamp')
        )

        if not queryset.exists():
            return {
                'count': 0,
                'values': [],
                'aggregated_value': None,
            }

        samples = [float(row.value) for row in queryset]

        return {
            'count': len(samples),
            'values': samples,
            'aggregated_value': self._aggregate_values(samples, metric.aggregation_method),
            'start_time': start_time,
            'end_time': end_time,
            'unit': metric.unit,
        }

    def _aggregate_values(self, values: List[float], method: str) -> Optional[float]:
        """Reduce *values* with the named aggregation method.

        Unknown methods fall back to the average; empty input yields None.
        """
        if not values:
            return None

        average = lambda v: sum(v) / len(v)
        reducers = {
            'AVERAGE': average,
            'SUM': sum,
            'COUNT': len,
            'MIN': min,
            'MAX': max,
            'PERCENTILE_95': lambda v: self._calculate_percentile(v, 95),
            'PERCENTILE_99': lambda v: self._calculate_percentile(v, 99),
        }
        return reducers.get(method, average)(values)

    def _calculate_percentile(self, values: List[float], percentile: int) -> float:
        """Return the value at the given percentile of *values*."""
        ordered = sorted(values)
        # Index expression kept identical to preserve exact behavior.
        index = int((percentile / 100) * len(ordered))
        return ordered[min(index, len(ordered) - 1)]

    def get_metric_trends(self, metric: SystemMetric, days: int = 7) -> Dict[str, Any]:
        """Return daily aggregations and an overall trend over *days* days."""
        end_time = timezone.now()
        start_time = end_time - timedelta(days=days)

        daily_data = []
        for offset in range(days):
            window_start = start_time + timedelta(days=offset)
            window_end = window_start + timedelta(days=1)
            bucket = self.aggregate_metrics(metric, window_start, window_end)
            daily_data.append({
                'date': window_start.date(),
                'value': bucket['aggregated_value'],
                'count': bucket['count'],
            })

        observed = [day['value'] for day in daily_data if day['value'] is not None]
        return {
            'metric_name': metric.name,
            'period_days': days,
            'daily_data': daily_data,
            'trend': self._calculate_trend(observed),
        }

    def _calculate_trend(self, values: List[float]) -> str:
        """Classify the direction of *values*.

        Compares the mean of the first half against the second half:
        a change beyond +/-5% is INCREASING/DECREASING, otherwise STABLE.
        """
        if len(values) < 2:
            return 'STABLE'

        midpoint = len(values) // 2
        earlier, later = values[:midpoint], values[midpoint:]
        earlier_avg = sum(earlier) / len(earlier)
        later_avg = sum(later) / len(later)

        # Avoid division by zero when the baseline average is 0.
        if earlier_avg != 0:
            change_percent = ((later_avg - earlier_avg) / earlier_avg) * 100
        else:
            change_percent = 0

        if change_percent > 5:
            return 'INCREASING'
        if change_percent < -5:
            return 'DECREASING'
        return 'STABLE'