ETB/ETB-API/analytics_predictive_insights/ml/anomaly_detection.py

"""
ML-based anomaly detection for incident management
Implements various anomaly detection algorithms for identifying unusual patterns
"""
import logging
from datetime import timedelta
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

from django.db.models import Avg
from django.utils import timezone

from incident_intelligence.models import Incident
from ..models import AnomalyDetection, PredictiveModel

logger = logging.getLogger(__name__)


class AnomalyDetector:
    """Base class for anomaly detection algorithms"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        self.model_config = model_config or {}
        self.scaler = StandardScaler()
        self.is_fitted = False

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the anomaly detection model"""
        raise NotImplementedError

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies in the data"""
        raise NotImplementedError

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores for the data"""
        raise NotImplementedError


class StatisticalAnomalyDetector(AnomalyDetector):
    """Statistical anomaly detection using z-score and IQR methods"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        self.z_threshold = self.model_config.get('z_threshold', 3.0)
        self.iqr_multiplier = self.model_config.get('iqr_multiplier', 1.5)
        self.stats_cache = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Calculate per-column statistics used for anomaly detection"""
        for column in data.columns:
            if data[column].dtype in ['int64', 'float64']:
                values = data[column].dropna()
                if len(values) > 0:
                    q1 = values.quantile(0.25)
                    q3 = values.quantile(0.75)
                    self.stats_cache[column] = {
                        'mean': values.mean(),
                        'std': values.std(),
                        'q1': q1,
                        'q3': q3,
                        'iqr': q3 - q1,
                    }
        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies using statistical methods"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        anomaly_flags = np.zeros(len(data), dtype=bool)
        for column in data.columns:
            if column in self.stats_cache and data[column].dtype in ['int64', 'float64']:
                values = data[column].dropna()
                if len(values) > 0:
                    col_stats = self.stats_cache[column]
                    # Z-score method (epsilon guards against zero variance)
                    z_scores = np.abs((values - col_stats['mean']) / (col_stats['std'] + 1e-8))
                    z_anomalies = z_scores > self.z_threshold
                    # IQR method
                    lower_bound = col_stats['q1'] - self.iqr_multiplier * col_stats['iqr']
                    upper_bound = col_stats['q3'] + self.iqr_multiplier * col_stats['iqr']
                    iqr_anomalies = (values < lower_bound) | (values > upper_bound)
                    # A row is anomalous if either method flags it
                    column_anomalies = z_anomalies | iqr_anomalies
                    anomaly_flags[values.index] |= column_anomalies.to_numpy()
        return anomaly_flags

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores based on z-scores"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")
        scores = np.zeros(len(data))
        for column in data.columns:
            if column in self.stats_cache and data[column].dtype in ['int64', 'float64']:
                values = data[column].dropna()
                if len(values) > 0:
                    col_stats = self.stats_cache[column]
                    z_scores = np.abs((values - col_stats['mean']) / (col_stats['std'] + 1e-8))
                    scores[values.index] += z_scores.to_numpy()
        return scores
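
# Example usage (an illustrative sketch, not part of the production flow).
# With these numbers the z-score stays below the default threshold of 3.0,
# but the IQR rule should still flag the last row:
#
#   df = pd.DataFrame({'affected_users': [3, 5, 4, 6, 500]})
#   detector = StatisticalAnomalyDetector({'z_threshold': 3.0})
#   detector.fit(df)
#   flags = detector.predict(df)  # array([False, False, False, False, True])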


class IsolationForestAnomalyDetector(AnomalyDetector):
    """Isolation Forest anomaly detection"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        self.contamination = self.model_config.get('contamination', 0.1)
        self.n_estimators = self.model_config.get('n_estimators', 100)
        self.fill_values = None
        self.model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            random_state=42,
        )

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the Isolation Forest model"""
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            raise ValueError("No numeric columns found in data")
        # Impute missing values with the training medians and keep them
        # so that prediction uses the same fill values
        self.fill_values = numeric_data.median()
        numeric_data = numeric_data.fillna(self.fill_values)
        scaled_data = self.scaler.fit_transform(numeric_data)
        self.model.fit(scaled_data)
        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict anomalies using Isolation Forest"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return np.zeros(len(data), dtype=bool)
        numeric_data = numeric_data.fillna(self.fill_values)
        scaled_data = self.scaler.transform(numeric_data)
        # IsolationForest returns -1 for anomalies and 1 for normal points
        predictions = self.model.predict(scaled_data)
        return predictions == -1

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get anomaly scores from Isolation Forest"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return np.zeros(len(data))
        numeric_data = numeric_data.fillna(self.fill_values)
        scaled_data = self.scaler.transform(numeric_data)
        # decision_function: lower means more anomalous, so negate it to
        # get positive scores where higher = more anomalous
        scores = self.model.decision_function(scaled_data)
        return -scores
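
# Example usage (an illustrative sketch; exact flags depend on the sample).
# With contamination=0.05, roughly the most isolated ~5% of rows are flagged:
#
#   rng = np.random.default_rng(0)
#   df = pd.DataFrame({
#       'affected_users': rng.normal(50, 5, 100),
#       'downtime_hours': rng.normal(2, 0.5, 100),
#   })
#   detector = IsolationForestAnomalyDetector({'contamination': 0.05})
#   detector.fit(df)
#   scores = detector.get_anomaly_scores(df)  # higher = more anomalous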


class TemporalAnomalyDetector(AnomalyDetector):
    """Temporal anomaly detection for time series data"""

    def __init__(self, model_config: Optional[Dict[str, Any]] = None):
        super().__init__(model_config)
        # Rolling window length in observations (24 = one day of hourly data)
        self.window_size = self.model_config.get('window_size', 24)
        self.threshold_multiplier = self.model_config.get('threshold_multiplier', 2.0)
        self.temporal_stats = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Calculate temporal statistics for anomaly detection"""
        if 'timestamp' not in data.columns:
            raise ValueError("Timestamp column is required for temporal anomaly detection")
        data_sorted = data.sort_values('timestamp')
        for column in data_sorted.columns:
            if column != 'timestamp' and data_sorted[column].dtype in ['int64', 'float64']:
                # Rolling mean and std, indexed like the original rows
                rolling_mean = data_sorted[column].rolling(window=self.window_size, min_periods=1).mean()
                rolling_std = data_sorted[column].rolling(window=self.window_size, min_periods=1).std()
                self.temporal_stats[column] = {
                    'rolling_mean': rolling_mean,
                    'rolling_std': rolling_std,
                }
        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Predict temporal anomalies"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        if 'timestamp' not in data.columns:
            return np.zeros(len(data), dtype=bool)
        data_sorted = data.sort_values('timestamp')
        anomaly_flags = pd.Series(False, index=data_sorted.index)
        for column in data_sorted.columns:
            if column in self.temporal_stats and column != 'timestamp':
                values = data_sorted[column]
                rolling_mean = self.temporal_stats[column]['rolling_mean']
                rolling_std = self.temporal_stats[column]['rolling_std']
                # Z-scores against the rolling statistics; the epsilon
                # guards against zero-variance windows
                z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
                anomaly_flags |= z_scores > self.threshold_multiplier
        # Map the flags back to the caller's original row order
        return anomaly_flags.reindex(data.index).to_numpy()

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Get temporal anomaly scores"""
        if not self.is_fitted:
            raise ValueError("Model must be fitted before scoring")
        if 'timestamp' not in data.columns:
            return np.zeros(len(data))
        data_sorted = data.sort_values('timestamp')
        scores = pd.Series(0.0, index=data_sorted.index)
        for column in data_sorted.columns:
            if column in self.temporal_stats and column != 'timestamp':
                values = data_sorted[column]
                rolling_mean = self.temporal_stats[column]['rolling_mean']
                rolling_std = self.temporal_stats[column]['rolling_std']
                z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
                scores += z_scores.fillna(0.0)
        # Return scores in the caller's original row order
        return scores.reindex(data.index).to_numpy()
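
# Example usage (an illustrative sketch): a flat hourly series with a single
# spike; the rolling z-score should flag the spike row and nothing else:
#
#   ts = pd.date_range('2025-01-01', periods=48, freq='h').astype('int64') // 10**9
#   counts = [5.0] * 48
#   counts[40] = 60.0  # spike
#   df = pd.DataFrame({'timestamp': ts, 'incident_count': counts})
#   detector = TemporalAnomalyDetector({'window_size': 24})
#   detector.fit(df)
#   flags = detector.predict(df)  # True only at position 40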


class AnomalyDetectionEngine:
    """Main engine for anomaly detection"""

    def __init__(self):
        self.detectors = {
            'statistical': StatisticalAnomalyDetector,
            'isolation_forest': IsolationForestAnomalyDetector,
            'temporal': TemporalAnomalyDetector,
        }

    def create_detector(self, algorithm_type: str, model_config: Optional[Dict[str, Any]] = None) -> AnomalyDetector:
        """Create an anomaly detector instance"""
        if algorithm_type not in self.detectors:
            raise ValueError(f"Unknown algorithm type: {algorithm_type}")
        return self.detectors[algorithm_type](model_config)

    def prepare_incident_data(self, time_window_hours: int = 24) -> pd.DataFrame:
        """Prepare incident data for anomaly detection"""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)
        # Get incidents from the time window
        incidents = Incident.objects.filter(
            created_at__gte=start_time,
            created_at__lte=end_time
        ).values(
            'id', 'created_at', 'severity', 'category', 'subcategory',
            'affected_users', 'estimated_downtime', 'status'
        )
        if not incidents:
            return pd.DataFrame()
        df = pd.DataFrame(list(incidents))
        # Convert datetime to a Unix timestamp in seconds
        df['timestamp'] = pd.to_datetime(df['created_at']).astype('int64') // 10**9
        # Encode severity as an ordinal feature
        severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
        df['severity_encoded'] = df['severity'].map(severity_mapping).fillna(0)
        # Convert estimated_downtime (a timedelta) to hours
        df['downtime_hours'] = df['estimated_downtime'].apply(
            lambda x: x.total_seconds() / 3600 if x else 0
        )
        # Time-of-day and day-of-week features
        df['hour_of_day'] = pd.to_datetime(df['created_at']).dt.hour
        df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
        return df
    def detect_anomalies(self, model: PredictiveModel, time_window_hours: int = 24) -> List[Dict[str, Any]]:
        """Detect anomalies using the specified model"""
        try:
            data = self.prepare_incident_data(time_window_hours)
            if data.empty:
                logger.warning("No incident data found for anomaly detection")
                return []
            detector = self.create_detector(
                model.algorithm_type,
                model.model_config
            )
            detector.fit(data)
            anomaly_flags = detector.predict(data)
            anomaly_scores = detector.get_anomaly_scores(data)
            # Build one result record per flagged row
            anomalies = []
            for idx, is_anomaly in enumerate(anomaly_flags):
                if is_anomaly:
                    incident_data = data.iloc[idx]
                    anomaly_data = {
                        'model': model,
                        'anomaly_type': self._determine_anomaly_type(model.algorithm_type),
                        'severity': self._determine_severity(anomaly_scores[idx]),
                        # Crude normalization that clips scores into [0, 1]
                        'confidence_score': min(1.0, max(0.0, anomaly_scores[idx] / 10.0)),
                        'anomaly_score': float(anomaly_scores[idx]),
                        'threshold_used': self._get_threshold(model.algorithm_type, model.model_config),
                        'time_window_start': timezone.now() - timedelta(hours=time_window_hours),
                        'time_window_end': timezone.now(),
                        'description': self._generate_description(incident_data, anomaly_scores[idx]),
                        'affected_services': [incident_data.get('category', 'Unknown')],
                        'affected_metrics': ['incident_frequency', 'severity_distribution'],
                        'metadata': {
                            'incident_id': str(incident_data['id']),
                            'detection_algorithm': model.algorithm_type,
                            'time_window_hours': time_window_hours,
                        },
                    }
                    anomalies.append(anomaly_data)
            return anomalies
        except Exception as e:
            logger.error(f"Error in anomaly detection: {e}")
            return []

    def _determine_anomaly_type(self, algorithm_type: str) -> str:
        """Determine anomaly type based on algorithm"""
        mapping = {
            'statistical': 'STATISTICAL',
            'isolation_forest': 'PATTERN',
            'temporal': 'TEMPORAL',
        }
        return mapping.get(algorithm_type, 'STATISTICAL')

    def _determine_severity(self, anomaly_score: float) -> str:
        """Determine severity based on anomaly score"""
        if anomaly_score >= 5.0:
            return 'CRITICAL'
        elif anomaly_score >= 3.0:
            return 'HIGH'
        elif anomaly_score >= 2.0:
            return 'MEDIUM'
        return 'LOW'

    def _get_threshold(self, algorithm_type: str, model_config: Dict[str, Any]) -> float:
        """Get the threshold used for anomaly detection"""
        if algorithm_type == 'statistical':
            return model_config.get('z_threshold', 3.0)
        elif algorithm_type == 'isolation_forest':
            return model_config.get('contamination', 0.1)
        elif algorithm_type == 'temporal':
            return model_config.get('threshold_multiplier', 2.0)
        return 1.0

    def _generate_description(self, incident_data: pd.Series, anomaly_score: float) -> str:
        """Generate a human-readable description for the anomaly"""
        severity = incident_data.get('severity', 'Unknown')
        category = incident_data.get('category', 'Unknown')
        affected_users = incident_data.get('affected_users', 0)
        return (
            f"Anomalous incident detected: {severity} severity incident in "
            f"{category} category affecting {affected_users} users. "
            f"Anomaly score: {anomaly_score:.2f}"
        )


class AnomalyDetectionService:
    """Service for managing anomaly detection"""

    def __init__(self):
        self.engine = AnomalyDetectionEngine()

    def run_anomaly_detection(self, model_id: Optional[str] = None) -> int:
        """Run anomaly detection for all active models or a specific model"""
        models = PredictiveModel.objects.filter(
            model_type='ANOMALY_DETECTION',
            status='ACTIVE'
        )
        if model_id:
            models = models.filter(id=model_id)
        total_anomalies = 0
        for model in models:
            try:
                anomalies = self.engine.detect_anomalies(model)
                # Persist each detected anomaly
                for anomaly_data in anomalies:
                    AnomalyDetection.objects.create(**anomaly_data)
                    total_anomalies += 1
                logger.info(f"Detected {len(anomalies)} anomalies using model {model.name}")
            except Exception as e:
                logger.error(f"Error running anomaly detection for model {model.name}: {e}")
        return total_anomalies

    def get_anomaly_summary(self, time_window_hours: int = 24) -> Dict[str, Any]:
        """Get a summary of recent anomalies"""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)
        anomalies = AnomalyDetection.objects.filter(
            detected_at__gte=start_time,
            detected_at__lte=end_time
        )
        return {
            'total_anomalies': anomalies.count(),
            'critical_anomalies': anomalies.filter(severity='CRITICAL').count(),
            'high_anomalies': anomalies.filter(severity='HIGH').count(),
            'medium_anomalies': anomalies.filter(severity='MEDIUM').count(),
            'low_anomalies': anomalies.filter(severity='LOW').count(),
            'unresolved_anomalies': anomalies.filter(
                status__in=['DETECTED', 'INVESTIGATING']
            ).count(),
            'false_positive_rate': self._calculate_false_positive_rate(anomalies),
            'average_confidence': anomalies.aggregate(
                avg=Avg('confidence_score')
            )['avg'] or 0.0,
        }

    def _calculate_false_positive_rate(self, anomalies) -> float:
        """Calculate the false positive rate as a percentage"""
        total_anomalies = anomalies.count()
        if total_anomalies == 0:
            return 0.0
        false_positives = anomalies.filter(status='FALSE_POSITIVE').count()
        return (false_positives / total_anomalies) * 100
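
# Example usage (an illustrative sketch): a periodic job, e.g. a Celery beat
# task, could drive the service like this:
#
#   service = AnomalyDetectionService()
#   created = service.run_anomaly_detection()  # all active models
#   summary = service.get_anomaly_summary(24)  # last 24 hours
#   logger.info(f"Created {created} anomalies; summary: {summary}")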