"""
|
|
ML-based anomaly detection for incident management
|
|
Implements various anomaly detection algorithms for identifying unusual patterns
|
|
"""
|
|

import logging
from datetime import timedelta
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

from django.utils import timezone
from django.db.models import Avg

from incident_intelligence.models import Incident
from ..models import AnomalyDetection, PredictiveModel

logger = logging.getLogger(__name__)


class AnomalyDetector:
    """Base class for anomaly detection algorithms"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        self.model_config = model_config or {}
        self.scaler = StandardScaler()
        self.is_fitted = False

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the anomaly detection model"""
        raise NotImplementedError

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies in the data (boolean array, True = anomaly)"""
        raise NotImplementedError

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores for the data (higher = more anomalous)"""
        raise NotImplementedError


class StatisticalAnomalyDetector(AnomalyDetector):
    """Statistical anomaly detection using z-score and IQR methods"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        self.z_threshold = self.model_config.get('z_threshold', 3.0)
        self.iqr_multiplier = self.model_config.get('iqr_multiplier', 1.5)
        self.stats_cache = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Calculate statistical parameters for anomaly detection"""
        for column in data.columns:
            if pd.api.types.is_numeric_dtype(data[column]):
                values = data[column].dropna()
                if len(values) > 0:
                    self.stats_cache[column] = {
                        'mean': values.mean(),
                        'std': values.std(),
                        'q1': values.quantile(0.25),
                        'q3': values.quantile(0.75),
                        'iqr': values.quantile(0.75) - values.quantile(0.25),
                    }
        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies using statistical methods"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")

        anomaly_flags = np.zeros(len(data), dtype=bool)

        for column in data.columns:
            if column in self.stats_cache and pd.api.types.is_numeric_dtype(data[column]):
                values = data[column].dropna()
                if len(values) == 0:
                    continue
                col_stats = self.stats_cache[column]

                # Z-score method; skip constant or single-valued columns where
                # the z-score is undefined (avoids division by zero/NaN)
                std = col_stats['std']
                if not np.isfinite(std) or std == 0:
                    continue
                z_scores = np.abs((values - col_stats['mean']) / std)
                z_anomalies = z_scores > self.z_threshold

                # IQR method
                lower_bound = col_stats['q1'] - self.iqr_multiplier * col_stats['iqr']
                upper_bound = col_stats['q3'] + self.iqr_multiplier * col_stats['iqr']
                iqr_anomalies = (values < lower_bound) | (values > upper_bound)

                # Combine both methods, mapping index labels back to positional
                # indices so rows dropped by dropna() stay aligned
                column_anomalies = (z_anomalies | iqr_anomalies).to_numpy()
                positions = data.index.get_indexer(values.index)
                anomaly_flags[positions] |= column_anomalies

        return anomaly_flags

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores based on z-scores"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")

        scores = np.zeros(len(data))

        for column in data.columns:
            if column in self.stats_cache and pd.api.types.is_numeric_dtype(data[column]):
                values = data[column].dropna()
                if len(values) == 0:
                    continue
                col_stats = self.stats_cache[column]
                std = col_stats['std']
                if not np.isfinite(std) or std == 0:
                    continue
                z_scores = np.abs((values - col_stats['mean']) / std)
                positions = data.index.get_indexer(values.index)
                scores[positions] += z_scores.to_numpy()

        return scores
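

# Example usage (a minimal sketch with made-up numbers; any real feature
# frame, such as the one built by prepare_incident_data() below, works the
# same way). The final row is caught by the IQR rule:
#
#   detector = StatisticalAnomalyDetector({'z_threshold': 2.5})
#   frame = pd.DataFrame({'affected_users': [10, 12, 9, 11, 480]})
#   detector.fit(frame)
#   detector.predict(frame)   # -> array([False, False, False, False,  True])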


class IsolationForestAnomalyDetector(AnomalyDetector):
    """Isolation Forest anomaly detection"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        self.contamination = self.model_config.get('contamination', 0.1)
        self.n_estimators = self.model_config.get('n_estimators', 100)
        self.feature_columns = None
        self.feature_medians = None
        self.model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            random_state=42
        )

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the Isolation Forest model"""
        # Select numeric columns only
        numeric_data = data.select_dtypes(include=[np.number])

        if numeric_data.empty:
            raise ValueError("No numeric columns found in data")

        # Remember the training columns and medians so predict() imputes and
        # orders features exactly the way the model was trained
        self.feature_columns = list(numeric_data.columns)
        self.feature_medians = numeric_data.median()
        numeric_data = numeric_data.fillna(self.feature_medians)

        # Scale the data and fit the model
        scaled_data = self.scaler.fit_transform(numeric_data)
        self.model.fit(scaled_data)
        self.is_fitted = True

    def _prepare(self, data: pd.DataFrame) -> Optional[np.ndarray]:
        """Reindex to the training features, impute, and scale; None if empty"""
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return None
        numeric_data = numeric_data.reindex(columns=self.feature_columns)
        numeric_data = numeric_data.fillna(self.feature_medians)
        return self.scaler.transform(numeric_data)

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies using Isolation Forest"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")

        scaled_data = self._prepare(data)
        if scaled_data is None:
            return np.zeros(len(data), dtype=bool)

        # Isolation Forest returns -1 for anomalies, 1 for normal points
        return self.model.predict(scaled_data) == -1

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores from Isolation Forest"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")

        scaled_data = self._prepare(data)
        if scaled_data is None:
            return np.zeros(len(data))

        # decision_function gives higher values to normal points, so negate it
        # to make higher scores mean more anomalous
        return -self.model.decision_function(scaled_data)
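

# Example usage (a minimal sketch; `training_frame` and `live_frame` stand in
# for any numeric DataFrames, and contamination is data-dependent, so 0.05
# here is only illustrative):
#
#   detector = IsolationForestAnomalyDetector({'contamination': 0.05})
#   detector.fit(training_frame)
#   flags = detector.predict(live_frame)            # True where a row looks isolated
#   scores = detector.get_anomaly_scores(live_frame)  # higher = more anomalous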


class TemporalAnomalyDetector(AnomalyDetector):
    """Temporal anomaly detection for time series data"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        # The window is measured in rows of the time-sorted frame (one row
        # per incident), not in wall-clock hours
        self.window_size = self.model_config.get('window_size', 24)
        self.threshold_multiplier = self.model_config.get('threshold_multiplier', 2.0)
        self.temporal_stats = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Calculate rolling statistics for anomaly detection.

        The rolling series are stored against the frame's row index, so
        predict() and get_anomaly_scores() expect the same frame that was
        fitted (which is how AnomalyDetectionEngine uses this class).
        """
        if 'timestamp' not in data.columns:
            raise ValueError("Timestamp column is required for temporal anomaly detection")

        # Sort by timestamp and compute rolling statistics per numeric column
        data_sorted = data.sort_values('timestamp')
        for column in data_sorted.columns:
            if column != 'timestamp' and pd.api.types.is_numeric_dtype(data_sorted[column]):
                rolling = data_sorted[column].rolling(window=self.window_size, min_periods=1)
                self.temporal_stats[column] = {
                    'rolling_mean': rolling.mean(),
                    'rolling_std': rolling.std(),
                }

        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict temporal anomalies"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")

        if 'timestamp' not in data.columns:
            return np.zeros(len(data), dtype=bool)

        anomaly_flags = pd.Series(False, index=data.index)

        for column, column_stats in self.temporal_stats.items():
            if column not in data.columns:
                continue
            values = data[column]
            rolling_mean = column_stats['rolling_mean'].reindex(values.index)
            rolling_std = column_stats['rolling_std'].reindex(values.index)

            # Z-scores against the rolling statistics; the small epsilon
            # avoids division by zero on flat windows
            z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
            anomaly_flags |= z_scores > self.threshold_multiplier

        # Aligning by index (rather than sorting) keeps the returned flags in
        # the caller's original row order
        return anomaly_flags.to_numpy()

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get temporal anomaly scores"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")

        if 'timestamp' not in data.columns:
            return np.zeros(len(data))

        scores = pd.Series(0.0, index=data.index)

        for column, column_stats in self.temporal_stats.items():
            if column not in data.columns:
                continue
            values = data[column]
            rolling_mean = column_stats['rolling_mean'].reindex(values.index)
            rolling_std = column_stats['rolling_std'].reindex(values.index)
            z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
            scores += z_scores.fillna(0.0)

        return scores.to_numpy()
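

# Example usage (a minimal sketch; timestamps and counts are made up, and the
# detector is scored on the same frame it was fitted on, per the contract
# documented in fit() above):
#
#   frame = pd.DataFrame({
#       'timestamp': range(100),
#       'incident_count': [3] * 99 + [40],
#   })
#   detector = TemporalAnomalyDetector({'window_size': 12})
#   detector.fit(frame)
#   detector.predict(frame)[-1]   # -> True: the spike at the end stands out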


class AnomalyDetectionEngine:
    """Main engine for anomaly detection"""

    def __init__(self):
        self.detectors = {
            'statistical': StatisticalAnomalyDetector,
            'isolation_forest': IsolationForestAnomalyDetector,
            'temporal': TemporalAnomalyDetector,
        }

    def create_detector(self, algorithm_type: str, model_config: Optional[Dict[str, Any]] = None) -> AnomalyDetector:
        """Create an anomaly detector instance"""
        if algorithm_type not in self.detectors:
            raise ValueError(f"Unknown algorithm type: {algorithm_type}")

        return self.detectors[algorithm_type](model_config)
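
    # Example (hypothetical): new algorithms plug in by registering another
    # AnomalyDetector subclass; DBSCANAnomalyDetector is not implemented here.
    #
    #   engine = AnomalyDetectionEngine()
    #   engine.detectors['dbscan'] = DBSCANAnomalyDetector
    #   detector = engine.create_detector('statistical', {'z_threshold': 2.0})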

    def prepare_incident_data(self, time_window_hours: int = 24) -> pd.DataFrame:
        """Prepare incident data for anomaly detection"""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)

        # Get incidents from the time window
        incidents = list(Incident.objects.filter(
            created_at__gte=start_time,
            created_at__lte=end_time
        ).values(
            'id', 'created_at', 'severity', 'category', 'subcategory',
            'affected_users', 'estimated_downtime', 'status'
        ))

        if not incidents:
            return pd.DataFrame()

        df = pd.DataFrame(incidents)

        # Convert datetime to a Unix timestamp in seconds
        df['timestamp'] = pd.to_datetime(df['created_at']).astype('int64') // 10**9

        # Encode categorical severity as an ordinal feature
        severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
        df['severity_encoded'] = df['severity'].map(severity_mapping).fillna(0)

        # Convert estimated_downtime to hours (guard against None/NaT)
        df['downtime_hours'] = df['estimated_downtime'].apply(
            lambda x: x.total_seconds() / 3600 if pd.notna(x) else 0.0
        )

        # Create time-based features
        created = pd.to_datetime(df['created_at'])
        df['hour_of_day'] = created.dt.hour
        df['day_of_week'] = created.dt.dayofweek

        return df
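
    # The resulting frame mixes identifiers ('id', 'created_at', the raw
    # categorical columns) with the numeric features the detectors actually
    # consume ('timestamp', 'severity_encoded', 'affected_users',
    # 'downtime_hours', 'hour_of_day', 'day_of_week'); non-numeric columns
    # are ignored by the detectors above.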

    def detect_anomalies(self, model: PredictiveModel, time_window_hours: int = 24) -> List[Dict[str, Any]]:
        """Detect anomalies using the specified model"""
        try:
            # Prepare data
            data = self.prepare_incident_data(time_window_hours)

            if data.empty:
                logger.warning("No incident data found for anomaly detection")
                return []

            # Create and fit the detector, then score the same window
            detector = self.create_detector(model.algorithm_type, model.model_config)
            detector.fit(data)
            anomaly_flags = detector.predict(data)
            anomaly_scores = detector.get_anomaly_scores(data)

            # Record the window once so start and end stay consistent
            window_end = timezone.now()
            window_start = window_end - timedelta(hours=time_window_hours)

            # Process results
            anomalies = []
            for idx, is_anomaly in enumerate(anomaly_flags):
                if not is_anomaly:
                    continue
                incident_data = data.iloc[idx]
                anomalies.append({
                    'model': model,
                    'anomaly_type': self._determine_anomaly_type(model.algorithm_type),
                    'severity': self._determine_severity(anomaly_scores[idx]),
                    # Heuristic squashing of the raw score into [0, 1]
                    'confidence_score': min(1.0, max(0.0, anomaly_scores[idx] / 10.0)),
                    'anomaly_score': float(anomaly_scores[idx]),
                    'threshold_used': self._get_threshold(model.algorithm_type, model.model_config),
                    'time_window_start': window_start,
                    'time_window_end': window_end,
                    'description': self._generate_description(incident_data, anomaly_scores[idx]),
                    'affected_services': [incident_data.get('category', 'Unknown')],
                    'affected_metrics': ['incident_frequency', 'severity_distribution'],
                    'metadata': {
                        'incident_id': str(incident_data['id']),
                        'detection_algorithm': model.algorithm_type,
                        'time_window_hours': time_window_hours,
                    },
                })

            return anomalies

        except Exception as e:
            logger.error(f"Error in anomaly detection: {e}")
            return []

    def _determine_anomaly_type(self, algorithm_type: str) -> str:
        """Determine anomaly type based on algorithm"""
        mapping = {
            'statistical': 'STATISTICAL',
            'isolation_forest': 'PATTERN',
            'temporal': 'TEMPORAL',
        }
        return mapping.get(algorithm_type, 'STATISTICAL')

    def _determine_severity(self, anomaly_score: float) -> str:
        """Determine severity based on anomaly score"""
        if anomaly_score >= 5.0:
            return 'CRITICAL'
        elif anomaly_score >= 3.0:
            return 'HIGH'
        elif anomaly_score >= 2.0:
            return 'MEDIUM'
        else:
            return 'LOW'

    def _get_threshold(self, algorithm_type: str, model_config: Dict[str, Any]) -> float:
        """Get the threshold used for anomaly detection"""
        if algorithm_type == 'statistical':
            return model_config.get('z_threshold', 3.0)
        elif algorithm_type == 'isolation_forest':
            return model_config.get('contamination', 0.1)
        elif algorithm_type == 'temporal':
            return model_config.get('threshold_multiplier', 2.0)
        return 1.0

    def _generate_description(self, incident_data: pd.Series, anomaly_score: float) -> str:
        """Generate a human-readable description for the anomaly"""
        severity = incident_data.get('severity', 'Unknown')
        category = incident_data.get('category', 'Unknown')
        affected_users = incident_data.get('affected_users', 0)

        return (
            f"Anomalous incident detected: {severity} severity incident in "
            f"{category} category affecting {affected_users} users. "
            f"Anomaly score: {anomaly_score:.2f}"
        )


class AnomalyDetectionService:
    """Service for managing anomaly detection"""

    def __init__(self):
        self.engine = AnomalyDetectionEngine()

    def run_anomaly_detection(self, model_id: Optional[str] = None) -> int:
        """Run anomaly detection for all active models or a specific model"""
        models = PredictiveModel.objects.filter(
            model_type='ANOMALY_DETECTION',
            status='ACTIVE'
        )
        if model_id:
            models = models.filter(id=model_id)

        total_anomalies = 0

        for model in models:
            try:
                # Detect anomalies and persist them
                anomalies = self.engine.detect_anomalies(model)
                for anomaly_data in anomalies:
                    AnomalyDetection.objects.create(**anomaly_data)
                    total_anomalies += 1

                logger.info(f"Detected {len(anomalies)} anomalies using model {model.name}")

            except Exception as e:
                logger.error(f"Error running anomaly detection for model {model.name}: {e}")

        return total_anomalies

    def get_anomaly_summary(self, time_window_hours: int = 24) -> Dict[str, Any]:
        """Get a summary of recent anomalies"""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)

        anomalies = AnomalyDetection.objects.filter(
            detected_at__gte=start_time,
            detected_at__lte=end_time
        )

        return {
            'total_anomalies': anomalies.count(),
            'critical_anomalies': anomalies.filter(severity='CRITICAL').count(),
            'high_anomalies': anomalies.filter(severity='HIGH').count(),
            'medium_anomalies': anomalies.filter(severity='MEDIUM').count(),
            'low_anomalies': anomalies.filter(severity='LOW').count(),
            'unresolved_anomalies': anomalies.filter(
                status__in=['DETECTED', 'INVESTIGATING']
            ).count(),
            'false_positive_rate': self._calculate_false_positive_rate(anomalies),
            'average_confidence': anomalies.aggregate(
                avg=Avg('confidence_score')
            )['avg'] or 0.0,
        }

    def _calculate_false_positive_rate(self, anomalies) -> float:
        """Calculate the false positive rate as a percentage"""
        total_anomalies = anomalies.count()
        if total_anomalies == 0:
            return 0.0

        false_positives = anomalies.filter(status='FALSE_POSITIVE').count()
        return (false_positives / total_anomalies) * 100
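

# Example end-to-end run (a minimal sketch; it assumes Django settings are
# configured and at least one ACTIVE PredictiveModel of type
# ANOMALY_DETECTION exists, so it is illustrative rather than a test):
#
#   # The import path below is hypothetical; use this module's real location
#   from predictive_analytics.ml.anomaly_detection import AnomalyDetectionService
#
#   service = AnomalyDetectionService()
#   created = service.run_anomaly_detection()   # fit, predict, persist
#   summary = service.get_anomaly_summary(24)   # counts for the last day
#   print(created, summary['false_positive_rate'])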