"""
|
|
ML-based anomaly detection for incident management
|
|
Implements various anomaly detection algorithms for identifying unusual patterns
|
|
"""
|
|

import logging
from datetime import timedelta
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

from django.utils import timezone
from django.db.models import Avg

from incident_intelligence.models import Incident
from ..models import AnomalyDetection, PredictiveModel

logger = logging.getLogger(__name__)


class AnomalyDetector:
    """Base class for anomaly detection algorithms"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        self.model_config = model_config or {}
        self.scaler = StandardScaler()
        self.is_fitted = False

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the anomaly detection model"""
        raise NotImplementedError

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies in the data (boolean array, True = anomaly)"""
        raise NotImplementedError

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores for the data (higher = more anomalous)"""
        raise NotImplementedError


class StatisticalAnomalyDetector(AnomalyDetector):
    """Statistical anomaly detection using z-score and IQR methods"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        self.z_threshold = self.model_config.get('z_threshold', 3.0)
        self.iqr_multiplier = self.model_config.get('iqr_multiplier', 1.5)
        self.stats_cache = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Calculate statistical parameters for anomaly detection"""
        for column in data.columns:
            if pd.api.types.is_numeric_dtype(data[column]):
                values = data[column].dropna()
                if len(values) > 0:
                    self.stats_cache[column] = {
                        'mean': values.mean(),
                        'std': values.std(),
                        'q1': values.quantile(0.25),
                        'q3': values.quantile(0.75),
                        'iqr': values.quantile(0.75) - values.quantile(0.25),
                    }
        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies using statistical methods"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")

        anomaly_flags = np.zeros(len(data), dtype=bool)

        for column in data.columns:
            if column in self.stats_cache and pd.api.types.is_numeric_dtype(data[column]):
                values = data[column].dropna()
                if len(values) == 0:
                    continue
                col_stats = self.stats_cache[column]

                # Z-score method; skip constant or single-valued columns where
                # the z-score is undefined (avoids division by zero/NaN)
                std = col_stats['std']
                if not np.isfinite(std) or std == 0:
                    continue
                z_scores = np.abs((values - col_stats['mean']) / std)
                z_anomalies = z_scores > self.z_threshold

                # IQR method
                lower_bound = col_stats['q1'] - self.iqr_multiplier * col_stats['iqr']
                upper_bound = col_stats['q3'] + self.iqr_multiplier * col_stats['iqr']
                iqr_anomalies = (values < lower_bound) | (values > upper_bound)

                # Combine both methods, mapping index labels back to positional
                # indices so rows dropped by dropna() stay aligned
                column_anomalies = (z_anomalies | iqr_anomalies).to_numpy()
                positions = data.index.get_indexer(values.index)
                anomaly_flags[positions] |= column_anomalies

        return anomaly_flags

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores based on z-scores"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")

        scores = np.zeros(len(data))

        for column in data.columns:
            if column in self.stats_cache and pd.api.types.is_numeric_dtype(data[column]):
                values = data[column].dropna()
                if len(values) == 0:
                    continue
                col_stats = self.stats_cache[column]
                std = col_stats['std']
                if not np.isfinite(std) or std == 0:
                    continue
                z_scores = np.abs((values - col_stats['mean']) / std)
                positions = data.index.get_indexer(values.index)
                scores[positions] += z_scores.to_numpy()

        return scores
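

# Example usage (a minimal sketch with made-up numbers; any real feature
# frame, such as the one built by prepare_incident_data() below, works the
# same way). The final row is caught by the IQR rule:
#
#   detector = StatisticalAnomalyDetector({'z_threshold': 2.5})
#   frame = pd.DataFrame({'affected_users': [10, 12, 9, 11, 480]})
#   detector.fit(frame)
#   detector.predict(frame)   # -> array([False, False, False, False,  True])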


class IsolationForestAnomalyDetector(AnomalyDetector):
    """Isolation Forest anomaly detection"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        self.contamination = self.model_config.get('contamination', 0.1)
        self.n_estimators = self.model_config.get('n_estimators', 100)
        self.feature_columns = None
        self.feature_medians = None
        self.model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            random_state=42
        )

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the Isolation Forest model"""
        # Select numeric columns only
        numeric_data = data.select_dtypes(include=[np.number])

        if numeric_data.empty:
            raise ValueError("No numeric columns found in data")

        # Remember the training columns and medians so predict() imputes and
        # orders features exactly the way the model was trained
        self.feature_columns = list(numeric_data.columns)
        self.feature_medians = numeric_data.median()
        numeric_data = numeric_data.fillna(self.feature_medians)

        # Scale the data and fit the model
        scaled_data = self.scaler.fit_transform(numeric_data)
        self.model.fit(scaled_data)
        self.is_fitted = True

    def _prepare(self, data: pd.DataFrame) -> Optional[np.ndarray]:
        """Reindex to the training features, impute, and scale; None if empty"""
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return None
        numeric_data = numeric_data.reindex(columns=self.feature_columns)
        numeric_data = numeric_data.fillna(self.feature_medians)
        return self.scaler.transform(numeric_data)

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies using Isolation Forest"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")

        scaled_data = self._prepare(data)
        if scaled_data is None:
            return np.zeros(len(data), dtype=bool)

        # Isolation Forest returns -1 for anomalies, 1 for normal points
        return self.model.predict(scaled_data) == -1

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores from Isolation Forest"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")

        scaled_data = self._prepare(data)
        if scaled_data is None:
            return np.zeros(len(data))

        # decision_function gives higher values to normal points, so negate it
        # to make higher scores mean more anomalous
        return -self.model.decision_function(scaled_data)
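

# Example usage (a minimal sketch; `training_frame` and `live_frame` stand in
# for any numeric DataFrames, and contamination is data-dependent, so 0.05
# here is only illustrative):
#
#   detector = IsolationForestAnomalyDetector({'contamination': 0.05})
#   detector.fit(training_frame)
#   flags = detector.predict(live_frame)            # True where a row looks isolated
#   scores = detector.get_anomaly_scores(live_frame)  # higher = more anomalous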


class TemporalAnomalyDetector(AnomalyDetector):
    """Temporal anomaly detection for time series data"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        # The window is measured in rows of the time-sorted frame (one row
        # per incident), not in wall-clock hours
        self.window_size = self.model_config.get('window_size', 24)
        self.threshold_multiplier = self.model_config.get('threshold_multiplier', 2.0)
        self.temporal_stats = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Calculate rolling statistics for anomaly detection.

        The rolling series are stored against the frame's row index, so
        predict() and get_anomaly_scores() expect the same frame that was
        fitted (which is how AnomalyDetectionEngine uses this class).
        """
        if 'timestamp' not in data.columns:
            raise ValueError("Timestamp column is required for temporal anomaly detection")

        # Sort by timestamp and compute rolling statistics per numeric column
        data_sorted = data.sort_values('timestamp')
        for column in data_sorted.columns:
            if column != 'timestamp' and pd.api.types.is_numeric_dtype(data_sorted[column]):
                rolling = data_sorted[column].rolling(window=self.window_size, min_periods=1)
                self.temporal_stats[column] = {
                    'rolling_mean': rolling.mean(),
                    'rolling_std': rolling.std(),
                }

        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict temporal anomalies"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")

        if 'timestamp' not in data.columns:
            return np.zeros(len(data), dtype=bool)

        anomaly_flags = pd.Series(False, index=data.index)

        for column, column_stats in self.temporal_stats.items():
            if column not in data.columns:
                continue
            values = data[column]
            rolling_mean = column_stats['rolling_mean'].reindex(values.index)
            rolling_std = column_stats['rolling_std'].reindex(values.index)

            # Z-scores against the rolling statistics; the small epsilon
            # avoids division by zero on flat windows
            z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
            anomaly_flags |= z_scores > self.threshold_multiplier

        # Aligning by index (rather than sorting) keeps the returned flags in
        # the caller's original row order
        return anomaly_flags.to_numpy()

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get temporal anomaly scores"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")

        if 'timestamp' not in data.columns:
            return np.zeros(len(data))

        scores = pd.Series(0.0, index=data.index)

        for column, column_stats in self.temporal_stats.items():
            if column not in data.columns:
                continue
            values = data[column]
            rolling_mean = column_stats['rolling_mean'].reindex(values.index)
            rolling_std = column_stats['rolling_std'].reindex(values.index)
            z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
            scores += z_scores.fillna(0.0)

        return scores.to_numpy()
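

# Example usage (a minimal sketch; timestamps and counts are made up, and the
# detector is scored on the same frame it was fitted on, per the contract
# documented in fit() above):
#
#   frame = pd.DataFrame({
#       'timestamp': range(100),
#       'incident_count': [3] * 99 + [40],
#   })
#   detector = TemporalAnomalyDetector({'window_size': 12})
#   detector.fit(frame)
#   detector.predict(frame)[-1]   # -> True: the spike at the end stands out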


class AnomalyDetectionEngine:
    """Main engine for anomaly detection"""

    def __init__(self):
        self.detectors = {
            'statistical': StatisticalAnomalyDetector,
            'isolation_forest': IsolationForestAnomalyDetector,
            'temporal': TemporalAnomalyDetector,
        }

    def create_detector(self, algorithm_type: str, model_config: Optional[Dict[str, Any]] = None) -> AnomalyDetector:
        """Create an anomaly detector instance"""
        if algorithm_type not in self.detectors:
            raise ValueError(f"Unknown algorithm type: {algorithm_type}")

        return self.detectors[algorithm_type](model_config)
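
    # Example (hypothetical): new algorithms plug in by registering another
    # AnomalyDetector subclass; DBSCANAnomalyDetector is not implemented here.
    #
    #   engine = AnomalyDetectionEngine()
    #   engine.detectors['dbscan'] = DBSCANAnomalyDetector
    #   detector = engine.create_detector('statistical', {'z_threshold': 2.0})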

    def prepare_incident_data(self, time_window_hours: int = 24) -> pd.DataFrame:
        """Prepare incident data for anomaly detection"""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)

        # Get incidents from the time window
        incidents = list(Incident.objects.filter(
            created_at__gte=start_time,
            created_at__lte=end_time
        ).values(
            'id', 'created_at', 'severity', 'category', 'subcategory',
            'affected_users', 'estimated_downtime', 'status'
        ))

        if not incidents:
            return pd.DataFrame()

        df = pd.DataFrame(incidents)

        # Convert datetime to a Unix timestamp in seconds
        df['timestamp'] = pd.to_datetime(df['created_at']).astype('int64') // 10**9

        # Encode categorical severity as an ordinal feature
        severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
        df['severity_encoded'] = df['severity'].map(severity_mapping).fillna(0)

        # Convert estimated_downtime to hours (guard against None/NaT)
        df['downtime_hours'] = df['estimated_downtime'].apply(
            lambda x: x.total_seconds() / 3600 if pd.notna(x) else 0.0
        )

        # Create time-based features
        created = pd.to_datetime(df['created_at'])
        df['hour_of_day'] = created.dt.hour
        df['day_of_week'] = created.dt.dayofweek

        return df
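
    # The resulting frame mixes identifiers ('id', 'created_at', the raw
    # categorical columns) with the numeric features the detectors actually
    # consume ('timestamp', 'severity_encoded', 'affected_users',
    # 'downtime_hours', 'hour_of_day', 'day_of_week'); non-numeric columns
    # are ignored by the detectors above.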

    def detect_anomalies(self, model: PredictiveModel, time_window_hours: int = 24) -> List[Dict[str, Any]]:
        """Detect anomalies using the specified model"""
        try:
            # Prepare data
            data = self.prepare_incident_data(time_window_hours)

            if data.empty:
                logger.warning("No incident data found for anomaly detection")
                return []

            # Create and fit the detector, then score the same window
            detector = self.create_detector(model.algorithm_type, model.model_config)
            detector.fit(data)
            anomaly_flags = detector.predict(data)
            anomaly_scores = detector.get_anomaly_scores(data)

            # Record the window once so start and end stay consistent
            window_end = timezone.now()
            window_start = window_end - timedelta(hours=time_window_hours)

            # Process results
            anomalies = []
            for idx, is_anomaly in enumerate(anomaly_flags):
                if not is_anomaly:
                    continue
                incident_data = data.iloc[idx]
                anomalies.append({
                    'model': model,
                    'anomaly_type': self._determine_anomaly_type(model.algorithm_type),
                    'severity': self._determine_severity(anomaly_scores[idx]),
                    # Heuristic squashing of the raw score into [0, 1]
                    'confidence_score': min(1.0, max(0.0, anomaly_scores[idx] / 10.0)),
                    'anomaly_score': float(anomaly_scores[idx]),
                    'threshold_used': self._get_threshold(model.algorithm_type, model.model_config),
                    'time_window_start': window_start,
                    'time_window_end': window_end,
                    'description': self._generate_description(incident_data, anomaly_scores[idx]),
                    'affected_services': [incident_data.get('category', 'Unknown')],
                    'affected_metrics': ['incident_frequency', 'severity_distribution'],
                    'metadata': {
                        'incident_id': str(incident_data['id']),
                        'detection_algorithm': model.algorithm_type,
                        'time_window_hours': time_window_hours,
                    },
                })

            return anomalies

        except Exception as e:
            logger.error(f"Error in anomaly detection: {e}")
            return []

    def _determine_anomaly_type(self, algorithm_type: str) -> str:
        """Determine anomaly type based on algorithm"""
        mapping = {
            'statistical': 'STATISTICAL',
            'isolation_forest': 'PATTERN',
            'temporal': 'TEMPORAL',
        }
        return mapping.get(algorithm_type, 'STATISTICAL')

    def _determine_severity(self, anomaly_score: float) -> str:
        """Determine severity based on anomaly score"""
        if anomaly_score >= 5.0:
            return 'CRITICAL'
        elif anomaly_score >= 3.0:
            return 'HIGH'
        elif anomaly_score >= 2.0:
            return 'MEDIUM'
        else:
            return 'LOW'

    def _get_threshold(self, algorithm_type: str, model_config: Dict[str, Any]) -> float:
        """Get the threshold used for anomaly detection"""
        if algorithm_type == 'statistical':
            return model_config.get('z_threshold', 3.0)
        elif algorithm_type == 'isolation_forest':
            return model_config.get('contamination', 0.1)
        elif algorithm_type == 'temporal':
            return model_config.get('threshold_multiplier', 2.0)
        return 1.0

    def _generate_description(self, incident_data: pd.Series, anomaly_score: float) -> str:
        """Generate a human-readable description for the anomaly"""
        severity = incident_data.get('severity', 'Unknown')
        category = incident_data.get('category', 'Unknown')
        affected_users = incident_data.get('affected_users', 0)

        return (
            f"Anomalous incident detected: {severity} severity incident in "
            f"{category} category affecting {affected_users} users. "
            f"Anomaly score: {anomaly_score:.2f}"
        )


class AnomalyDetectionService:
    """Service for managing anomaly detection"""

    def __init__(self):
        self.engine = AnomalyDetectionEngine()

    def run_anomaly_detection(self, model_id: Optional[str] = None) -> int:
        """Run anomaly detection for all active models or a specific model"""
        models = PredictiveModel.objects.filter(
            model_type='ANOMALY_DETECTION',
            status='ACTIVE'
        )
        if model_id:
            models = models.filter(id=model_id)

        total_anomalies = 0

        for model in models:
            try:
                # Detect anomalies and persist them
                anomalies = self.engine.detect_anomalies(model)
                for anomaly_data in anomalies:
                    AnomalyDetection.objects.create(**anomaly_data)
                    total_anomalies += 1

                logger.info(f"Detected {len(anomalies)} anomalies using model {model.name}")

            except Exception as e:
                logger.error(f"Error running anomaly detection for model {model.name}: {e}")

        return total_anomalies

    def get_anomaly_summary(self, time_window_hours: int = 24) -> Dict[str, Any]:
        """Get a summary of recent anomalies"""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)

        anomalies = AnomalyDetection.objects.filter(
            detected_at__gte=start_time,
            detected_at__lte=end_time
        )

        return {
            'total_anomalies': anomalies.count(),
            'critical_anomalies': anomalies.filter(severity='CRITICAL').count(),
            'high_anomalies': anomalies.filter(severity='HIGH').count(),
            'medium_anomalies': anomalies.filter(severity='MEDIUM').count(),
            'low_anomalies': anomalies.filter(severity='LOW').count(),
            'unresolved_anomalies': anomalies.filter(
                status__in=['DETECTED', 'INVESTIGATING']
            ).count(),
            'false_positive_rate': self._calculate_false_positive_rate(anomalies),
            'average_confidence': anomalies.aggregate(
                avg=Avg('confidence_score')
            )['avg'] or 0.0,
        }

    def _calculate_false_positive_rate(self, anomalies) -> float:
        """Calculate the false positive rate as a percentage"""
        total_anomalies = anomalies.count()
        if total_anomalies == 0:
            return 0.0

        false_positives = anomalies.filter(status='FALSE_POSITIVE').count()
        return (false_positives / total_anomalies) * 100
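

# Example end-to-end run (a minimal sketch; it assumes Django settings are
# configured and at least one ACTIVE PredictiveModel of type
# ANOMALY_DETECTION exists, so it is illustrative rather than a test):
#
#   # The import path below is hypothetical; use this module's real location
#   from predictive_analytics.ml.anomaly_detection import AnomalyDetectionService
#
#   service = AnomalyDetectionService()
#   created = service.run_anomaly_detection()   # fit, predict, persist
#   summary = service.get_anomaly_summary(24)   # counts for the last day
#   print(created, summary['false_positive_rate'])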