""" ML-based anomaly detection for incident management Implements various anomaly detection algorithms for identifying unusual patterns """ import numpy as np import pandas as pd from typing import Dict, List, Tuple, Optional, Any from datetime import datetime, timedelta from sklearn.ensemble import IsolationForest from sklearn.preprocessing import StandardScaler from sklearn.cluster import DBSCAN from sklearn.decomposition import PCA from scipy import stats import logging from django.utils import timezone from django.db.models import Q, Avg, Count, Sum from incident_intelligence.models import Incident from ..models import AnomalyDetection, PredictiveModel logger = logging.getLogger(__name__) class AnomalyDetector: """Base class for anomaly detection algorithms""" def __init__(self, model_config: Dict[str, Any] = None): self.model_config = model_config or {} self.scaler = StandardScaler() self.is_fitted = False def fit(self, data: pd.DataFrame) -> None: """Fit the anomaly detection model""" raise NotImplementedError def predict(self, data: pd.DataFrame) -> np.ndarray: """Predict anomalies in the data""" raise NotImplementedError def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray: """Get anomaly scores for the data""" raise NotImplementedError class StatisticalAnomalyDetector(AnomalyDetector): """Statistical anomaly detection using z-score and IQR methods""" def __init__(self, model_config: Dict[str, Any] = None): super().__init__(model_config) self.z_threshold = self.model_config.get('z_threshold', 3.0) self.iqr_multiplier = self.model_config.get('iqr_multiplier', 1.5) self.stats_cache = {} def fit(self, data: pd.DataFrame) -> None: """Calculate statistical parameters for anomaly detection""" for column in data.columns: if data[column].dtype in ['int64', 'float64']: values = data[column].dropna() if len(values) > 0: self.stats_cache[column] = { 'mean': values.mean(), 'std': values.std(), 'q1': values.quantile(0.25), 'q3': values.quantile(0.75), 'iqr': values.quantile(0.75) - values.quantile(0.25) } self.is_fitted = True def predict(self, data: pd.DataFrame) -> np.ndarray: """Predict anomalies using statistical methods""" if not self.is_fitted: raise ValueError("Model must be fitted before prediction") anomaly_flags = np.zeros(len(data), dtype=bool) for column in data.columns: if column in self.stats_cache and data[column].dtype in ['int64', 'float64']: values = data[column].dropna() if len(values) > 0: stats = self.stats_cache[column] # Z-score method z_scores = np.abs((values - stats['mean']) / stats['std']) z_anomalies = z_scores > self.z_threshold # IQR method lower_bound = stats['q1'] - self.iqr_multiplier * stats['iqr'] upper_bound = stats['q3'] + self.iqr_multiplier * stats['iqr'] iqr_anomalies = (values < lower_bound) | (values > upper_bound) # Combine both methods column_anomalies = z_anomalies | iqr_anomalies anomaly_flags[values.index] |= column_anomalies return anomaly_flags def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray: """Get anomaly scores based on z-scores""" if not self.is_fitted: raise ValueError("Model must be fitted before prediction") scores = np.zeros(len(data)) for column in data.columns: if column in self.stats_cache and data[column].dtype in ['int64', 'float64']: values = data[column].dropna() if len(values) > 0: stats = self.stats_cache[column] z_scores = np.abs((values - stats['mean']) / stats['std']) scores[values.index] += z_scores return scores class IsolationForestAnomalyDetector(AnomalyDetector): """Isolation Forest anomaly 
detection""" def __init__(self, model_config: Dict[str, Any] = None): super().__init__(model_config) self.contamination = self.model_config.get('contamination', 0.1) self.n_estimators = self.model_config.get('n_estimators', 100) self.model = IsolationForest( contamination=self.contamination, n_estimators=self.n_estimators, random_state=42 ) def fit(self, data: pd.DataFrame) -> None: """Fit the Isolation Forest model""" # Select numeric columns only numeric_data = data.select_dtypes(include=[np.number]) if numeric_data.empty: raise ValueError("No numeric columns found in data") # Handle missing values numeric_data = numeric_data.fillna(numeric_data.median()) # Scale the data scaled_data = self.scaler.fit_transform(numeric_data) # Fit the model self.model.fit(scaled_data) self.is_fitted = True def predict(self, data: pd.DataFrame) -> np.ndarray: """Predict anomalies using Isolation Forest""" if not self.is_fitted: raise ValueError("Model must be fitted before prediction") # Select numeric columns only numeric_data = data.select_dtypes(include=[np.number]) if numeric_data.empty: return np.zeros(len(data), dtype=bool) # Handle missing values numeric_data = numeric_data.fillna(numeric_data.median()) # Scale the data scaled_data = self.scaler.transform(numeric_data) # Predict anomalies (-1 for anomalies, 1 for normal) predictions = self.model.predict(scaled_data) return predictions == -1 def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray: """Get anomaly scores from Isolation Forest""" if not self.is_fitted: raise ValueError("Model must be fitted before prediction") # Select numeric columns only numeric_data = data.select_dtypes(include=[np.number]) if numeric_data.empty: return np.zeros(len(data)) # Handle missing values numeric_data = numeric_data.fillna(numeric_data.median()) # Scale the data scaled_data = self.scaler.transform(numeric_data) # Get anomaly scores scores = self.model.decision_function(scaled_data) # Convert to positive scores (higher = more anomalous) return -scores class TemporalAnomalyDetector(AnomalyDetector): """Temporal anomaly detection for time series data""" def __init__(self, model_config: Dict[str, Any] = None): super().__init__(model_config) self.window_size = self.model_config.get('window_size', 24) # hours self.threshold_multiplier = self.model_config.get('threshold_multiplier', 2.0) self.temporal_stats = {} def fit(self, data: pd.DataFrame) -> None: """Calculate temporal statistics for anomaly detection""" if 'timestamp' not in data.columns: raise ValueError("Timestamp column is required for temporal anomaly detection") # Sort by timestamp data_sorted = data.sort_values('timestamp') # Calculate rolling statistics for column in data_sorted.columns: if column != 'timestamp' and data_sorted[column].dtype in ['int64', 'float64']: # Calculate rolling mean and std rolling_mean = data_sorted[column].rolling(window=self.window_size, min_periods=1).mean() rolling_std = data_sorted[column].rolling(window=self.window_size, min_periods=1).std() self.temporal_stats[column] = { 'rolling_mean': rolling_mean, 'rolling_std': rolling_std } self.is_fitted = True def predict(self, data: pd.DataFrame) -> np.ndarray: """Predict temporal anomalies""" if not self.is_fitted: raise ValueError("Model must be fitted before prediction") if 'timestamp' not in data.columns: return np.zeros(len(data), dtype=bool) # Sort by timestamp data_sorted = data.sort_values('timestamp') anomaly_flags = np.zeros(len(data_sorted), dtype=bool) for column in data_sorted.columns: if column in 
class TemporalAnomalyDetector(AnomalyDetector):
    """Temporal anomaly detection for time-series data.

    Rolling baselines are index-aligned to the rows seen during ``fit``, so
    ``predict`` and ``get_anomaly_scores`` are expected to run on the same
    DataFrame that was fitted (which is how the engine uses this class).
    """

    def __init__(self, model_config: Dict[str, Any] = None):
        super().__init__(model_config)
        # Rolling window length in rows, not wall-clock hours; with hourly
        # aggregates the default of 24 corresponds to one day.
        self.window_size = self.model_config.get('window_size', 24)
        self.threshold_multiplier = self.model_config.get('threshold_multiplier', 2.0)
        self.temporal_stats = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Compute rolling statistics for anomaly detection."""
        if 'timestamp' not in data.columns:
            raise ValueError("Timestamp column is required for temporal anomaly detection")

        data_sorted = data.sort_values('timestamp')

        for column in data_sorted.columns:
            if column != 'timestamp' and pd.api.types.is_numeric_dtype(data_sorted[column]):
                rolling = data_sorted[column].rolling(window=self.window_size, min_periods=1)
                self.temporal_stats[column] = {
                    'rolling_mean': rolling.mean(),
                    'rolling_std': rolling.std(),
                }

        self.is_fitted = True

    def _column_z_scores(self, data_sorted: pd.DataFrame, column: str) -> pd.Series:
        """Z-scores of a column relative to its rolling baseline."""
        values = data_sorted[column]
        rolling_mean = self.temporal_stats[column]['rolling_mean']
        rolling_std = self.temporal_stats[column]['rolling_std']
        # The epsilon avoids division by zero for constant windows; the first
        # row has an undefined rolling std, which is treated as a zero score.
        return ((values - rolling_mean) / (rolling_std + 1e-8)).abs().fillna(0.0)

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict temporal anomalies."""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        if 'timestamp' not in data.columns:
            return np.zeros(len(data), dtype=bool)

        data_sorted = data.sort_values('timestamp')
        flags = pd.Series(False, index=data_sorted.index)

        for column in data_sorted.columns:
            if column in self.temporal_stats and column != 'timestamp':
                flags |= self._column_z_scores(data_sorted, column) > self.threshold_multiplier

        # Return the flags in the caller's original row order, so positional
        # indexing against the input DataFrame stays valid.
        return flags.reindex(data.index).to_numpy()

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get temporal anomaly scores."""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        if 'timestamp' not in data.columns:
            return np.zeros(len(data))

        data_sorted = data.sort_values('timestamp')
        scores = pd.Series(0.0, index=data_sorted.index)

        for column in data_sorted.columns:
            if column in self.temporal_stats and column != 'timestamp':
                scores += self._column_z_scores(data_sorted, column)

        # Return the scores in the caller's original row order.
        return scores.reindex(data.index).to_numpy()
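
# Illustrative usage of the temporal detector (a sketch; the data is
# hypothetical). Fit and predict run on the same frame, matching how the
# engine calls it:
#
#     ts = pd.date_range('2024-01-01', periods=48, freq='h').astype('int64') // 10**9
#     df = pd.DataFrame({'timestamp': ts, 'incident_count': [3] * 47 + [40]})
#     detector = TemporalAnomalyDetector({'window_size': 24})
#     detector.fit(df)
#     detector.predict(df)  # only the final spike should be flagged
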
class AnomalyDetectionEngine:
    """Main engine for anomaly detection."""

    def __init__(self):
        self.detectors = {
            'statistical': StatisticalAnomalyDetector,
            'isolation_forest': IsolationForestAnomalyDetector,
            'temporal': TemporalAnomalyDetector,
        }

    def create_detector(self, algorithm_type: str, model_config: Dict[str, Any] = None) -> AnomalyDetector:
        """Create an anomaly detector instance."""
        if algorithm_type not in self.detectors:
            raise ValueError(f"Unknown algorithm type: {algorithm_type}")
        return self.detectors[algorithm_type](model_config)

    def prepare_incident_data(self, time_window_hours: int = 24) -> pd.DataFrame:
        """Prepare incident data for anomaly detection."""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)

        # Get incidents from the time window.
        incidents = Incident.objects.filter(
            created_at__gte=start_time,
            created_at__lte=end_time,
        ).values(
            'id', 'created_at', 'severity', 'category', 'subcategory',
            'affected_users', 'estimated_downtime', 'status',
        )

        if not incidents.exists():
            return pd.DataFrame()

        df = pd.DataFrame(list(incidents))

        # Epoch seconds, used as the sort key for temporal detection.
        created = pd.to_datetime(df['created_at'])
        df['timestamp'] = created.astype('int64') // 10**9

        # Encode severity on an ordinal scale.
        severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
        df['severity_encoded'] = df['severity'].map(severity_mapping).fillna(0)

        # Convert estimated_downtime (a timedelta) to hours.
        df['downtime_hours'] = df['estimated_downtime'].apply(
            lambda x: x.total_seconds() / 3600 if x else 0
        )

        # Time-based features.
        df['hour_of_day'] = created.dt.hour
        df['day_of_week'] = created.dt.dayofweek

        return df

    def detect_anomalies(self, model: PredictiveModel, time_window_hours: int = 24) -> List[Dict[str, Any]]:
        """Detect anomalies using the specified model."""
        try:
            data = self.prepare_incident_data(time_window_hours)
            if data.empty:
                logger.warning("No incident data found for anomaly detection")
                return []

            detector = self.create_detector(model.algorithm_type, model.model_config)

            # Fit on the window and score the same rows.
            detector.fit(data)
            anomaly_flags = detector.predict(data)
            anomaly_scores = detector.get_anomaly_scores(data)

            anomalies = []
            for idx, is_anomaly in enumerate(anomaly_flags):
                if is_anomaly:
                    incident_data = data.iloc[idx]
                    anomalies.append({
                        'model': model,
                        'anomaly_type': self._determine_anomaly_type(model.algorithm_type),
                        'severity': self._determine_severity(anomaly_scores[idx]),
                        # Heuristic normalisation of the raw score into [0, 1].
                        'confidence_score': min(1.0, max(0.0, anomaly_scores[idx] / 10.0)),
                        'anomaly_score': float(anomaly_scores[idx]),
                        'threshold_used': self._get_threshold(model.algorithm_type, model.model_config),
                        'time_window_start': timezone.now() - timedelta(hours=time_window_hours),
                        'time_window_end': timezone.now(),
                        'description': self._generate_description(incident_data, anomaly_scores[idx]),
                        'affected_services': [incident_data.get('category', 'Unknown')],
                        'affected_metrics': ['incident_frequency', 'severity_distribution'],
                        'metadata': {
                            'incident_id': str(incident_data['id']),
                            'detection_algorithm': model.algorithm_type,
                            'time_window_hours': time_window_hours,
                        },
                    })

            return anomalies
        except Exception as e:
            logger.error(f"Error in anomaly detection: {str(e)}")
            return []

    def _determine_anomaly_type(self, algorithm_type: str) -> str:
        """Map an algorithm type to an anomaly type."""
        mapping = {
            'statistical': 'STATISTICAL',
            'isolation_forest': 'PATTERN',
            'temporal': 'TEMPORAL',
        }
        return mapping.get(algorithm_type, 'STATISTICAL')

    def _determine_severity(self, anomaly_score: float) -> str:
        """Map an anomaly score to a severity level."""
        if anomaly_score >= 5.0:
            return 'CRITICAL'
        elif anomaly_score >= 3.0:
            return 'HIGH'
        elif anomaly_score >= 2.0:
            return 'MEDIUM'
        return 'LOW'

    def _get_threshold(self, algorithm_type: str, model_config: Dict[str, Any]) -> float:
        """Get the threshold used for anomaly detection."""
        if algorithm_type == 'statistical':
            return model_config.get('z_threshold', 3.0)
        elif algorithm_type == 'isolation_forest':
            return model_config.get('contamination', 0.1)
        elif algorithm_type == 'temporal':
            return model_config.get('threshold_multiplier', 2.0)
        return 1.0

    def _generate_description(self, incident_data: pd.Series, anomaly_score: float) -> str:
        """Generate a human-readable description of the anomaly."""
        severity = incident_data.get('severity', 'Unknown')
        category = incident_data.get('category', 'Unknown')
        affected_users = incident_data.get('affected_users', 0)
        return (
            f"Anomalous incident detected: {severity} severity incident in "
            f"{category} category affecting {affected_users} users. "
            f"Anomaly score: {anomaly_score:.2f}"
        )
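
# Illustrative end-to-end call (a sketch; assumes at least one active
# PredictiveModel row with model_type 'ANOMALY_DETECTION' exists):
#
#     engine = AnomalyDetectionEngine()
#     model = PredictiveModel.objects.filter(
#         model_type='ANOMALY_DETECTION', status='ACTIVE'
#     ).first()
#     if model:
#         for anomaly in engine.detect_anomalies(model, time_window_hours=24):
#             print(anomaly['severity'], anomaly['description'])
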
class AnomalyDetectionService:
    """Service for managing anomaly detection."""

    def __init__(self):
        self.engine = AnomalyDetectionEngine()

    def run_anomaly_detection(self, model_id: str = None) -> int:
        """Run anomaly detection for all active models, or a specific model."""
        models = PredictiveModel.objects.filter(
            model_type='ANOMALY_DETECTION',
            status='ACTIVE',
        )
        if model_id:
            models = models.filter(id=model_id)

        total_anomalies = 0
        for model in models:
            try:
                anomalies = self.engine.detect_anomalies(model)

                # Persist each detected anomaly.
                for anomaly_data in anomalies:
                    AnomalyDetection.objects.create(**anomaly_data)
                    total_anomalies += 1

                logger.info(f"Detected {len(anomalies)} anomalies using model {model.name}")
            except Exception as e:
                logger.error(f"Error running anomaly detection for model {model.name}: {str(e)}")

        return total_anomalies

    def get_anomaly_summary(self, time_window_hours: int = 24) -> Dict[str, Any]:
        """Get a summary of recent anomalies."""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)

        anomalies = AnomalyDetection.objects.filter(
            detected_at__gte=start_time,
            detected_at__lte=end_time,
        )

        return {
            'total_anomalies': anomalies.count(),
            'critical_anomalies': anomalies.filter(severity='CRITICAL').count(),
            'high_anomalies': anomalies.filter(severity='HIGH').count(),
            'medium_anomalies': anomalies.filter(severity='MEDIUM').count(),
            'low_anomalies': anomalies.filter(severity='LOW').count(),
            'unresolved_anomalies': anomalies.filter(
                status__in=['DETECTED', 'INVESTIGATING']
            ).count(),
            'false_positive_rate': self._calculate_false_positive_rate(anomalies),
            'average_confidence': anomalies.aggregate(
                avg=Avg('confidence_score')
            )['avg'] or 0.0,
        }

    def _calculate_false_positive_rate(self, anomalies) -> float:
        """Calculate the false positive rate as a percentage."""
        total_anomalies = anomalies.count()
        if total_anomalies == 0:
            return 0.0
        false_positives = anomalies.filter(status='FALSE_POSITIVE').count()
        return (false_positives / total_anomalies) * 100
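
# Illustrative service usage (a sketch; in practice this would typically be
# invoked from a scheduled task or management command):
#
#     service = AnomalyDetectionService()
#     created = service.run_anomaly_detection()  # all active models
#     summary = service.get_anomaly_summary(time_window_hours=24)
#     logger.info("Created %s anomalies; summary: %s", created, summary)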