Files
ETB/ETB-API/incident_intelligence/ai/correlation.py
Iliyan Angelov 6b247e5b9f Updates
2025-09-19 11:58:53 +03:00

482 lines
20 KiB
Python

"""
Correlation engine for linking related incidents and problem detection
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from django.utils import timezone
from .classification import IncidentClassifier
@dataclass
class CorrelationResult:
    """Result of incident correlation analysis.

    Produced by IncidentCorrelationEngine.correlate_incidents for a pair of
    incidents judged to be related.
    """
    correlation_type: str  # e.g. 'SAME_SERVICE', 'SAME_COMPONENT', 'TEMPORAL', 'DEPENDENCY', 'CASCADE', 'PATTERN'
    confidence_score: float  # 0.0-1.0, capped at 1.0 by _calculate_confidence_score
    correlation_strength: str  # 'VERY_STRONG' / 'STRONG' / 'MODERATE' / 'WEAK'
    shared_keywords: List[str]  # up to 10 keywords common to both incidents
    time_difference: timedelta  # absolute gap between the incidents' creation times
    similarity_score: float  # weighted blend of text/temporal/service/category similarity
    is_problem_indicator: bool  # True when the pair suggests a wider problem
    problem_description: Optional[str]  # human-readable note; None when no problem indicated
class IncidentCorrelationEngine:
    """
    AI-driven correlation engine for linking related incidents
    """
    def __init__(self):
        # Version tag for this heuristic model (not read elsewhere in the code shown).
        self.model_version = "v1.0"
        # Keyword extractor reused by the text-similarity computations.
        self.classifier = IncidentClassifier()
        # Correlation thresholds: minimum confidence score required for each
        # strength label (consumed by _determine_correlation_strength).
        self.correlation_thresholds = {
            'VERY_STRONG': 0.9,
            'STRONG': 0.7,
            'MODERATE': 0.5,
            'WEAK': 0.3
        }
        # Problem detection patterns consumed by _detect_problem_patterns.
        # NOTE(review): only 'keywords' is read there; 'time_window' and
        # 'min_incidents' appear unused by the visible code — confirm.
        self.problem_patterns = {
            'CASCADE_FAILURE': {
                'keywords': ['cascade', 'chain', 'reaction', 'domino', 'ripple', 'effect'],
                'time_window': timedelta(hours=2),
                'min_incidents': 3
            },
            'RECURRING_ISSUE': {
                'keywords': ['same', 'again', 'recurring', 'repeated', 'similar', 'identical'],
                'time_window': timedelta(days=7),
                'min_incidents': 2
            },
            'SERVICE_DEPENDENCY': {
                'keywords': ['dependency', 'dependent', 'downstream', 'upstream', 'service', 'api'],
                'time_window': timedelta(hours=1),
                'min_incidents': 2
            },
            'INFRASTRUCTURE_PATTERN': {
                'keywords': ['server', 'database', 'network', 'storage', 'infrastructure'],
                'time_window': timedelta(hours=4),
                'min_incidents': 3
            }
        }
def correlate_incidents(self, incident_a: Dict, incident_b: Dict) -> Optional[CorrelationResult]:
    """Compare two incidents and build a CorrelationResult, or None when unrelated.

    Pairs whose weighted overall similarity falls below 0.3 are treated as
    unrelated and yield None.
    """
    # Individual similarity signals.
    text_sim = self._calculate_text_similarity(incident_a, incident_b)
    temporal_sim = self._calculate_temporal_similarity(incident_a, incident_b)
    service_sim = self._calculate_service_similarity(incident_a, incident_b)
    category_sim = self._calculate_category_similarity(incident_a, incident_b)
    # Weighted blend: text dominates at 40%, the rest contribute 20% each.
    overall = (
        text_sim * 0.4 +
        temporal_sim * 0.2 +
        service_sim * 0.2 +
        category_sim * 0.2
    )
    # Below this floor the incidents are considered unrelated.
    if overall < 0.3:
        return None
    link_type = self._determine_correlation_type(
        incident_a, incident_b, text_sim, temporal_sim, service_sim
    )
    confidence = self._calculate_confidence_score(
        overall, link_type, incident_a, incident_b
    )
    is_problem, problem_text = self._detect_problem_patterns(
        incident_a, incident_b, link_type, confidence
    )
    return CorrelationResult(
        correlation_type=link_type,
        confidence_score=confidence,
        correlation_strength=self._determine_correlation_strength(confidence),
        shared_keywords=self._extract_shared_keywords(incident_a, incident_b),
        time_difference=self._calculate_time_difference(incident_a, incident_b),
        similarity_score=overall,
        is_problem_indicator=is_problem,
        problem_description=problem_text
    )
def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
    """Blend keyword Jaccard overlap (70%) with shared-phrase overlap (30%)."""

    def blob(incident: Dict) -> str:
        # All free-form text fields, lower-cased for matching.
        parts = (incident.get(field, '') for field in ('title', 'description', 'free_text'))
        return ' '.join(parts).lower()

    text_a, text_b = blob(incident_a), blob(incident_b)
    keywords_a = set(self.classifier._extract_keywords(text_a))
    keywords_b = set(self.classifier._extract_keywords(text_b))
    if not (keywords_a and keywords_b):
        return 0.0
    # Jaccard similarity over extracted keywords.
    union = keywords_a | keywords_b
    jaccard = len(keywords_a & keywords_b) / len(union) if union else 0.0
    # Complement with exact 2-/3-word phrase overlap.
    phrase = self._calculate_phrase_similarity(text_a, text_b)
    return jaccard * 0.7 + phrase * 0.3
def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate similarity based on common phrases"""
# Extract 2-3 word phrases
phrases_a = set()
phrases_b = set()
words_a = text_a.split()
words_b = text_b.split()
# Extract 2-word phrases
for i in range(len(words_a) - 1):
phrases_a.add(f"{words_a[i]} {words_a[i+1]}")
for i in range(len(words_b) - 1):
phrases_b.add(f"{words_b[i]} {words_b[i+1]}")
# Extract 3-word phrases
for i in range(len(words_a) - 2):
phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")
for i in range(len(words_b) - 2):
phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")
if not phrases_a or not phrases_b:
return 0.0
intersection = len(phrases_a.intersection(phrases_b))
union = len(phrases_a.union(phrases_b))
return intersection / union if union > 0 else 0.0
def _calculate_temporal_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate temporal similarity between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return 0.0
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
time_diff = abs((created_a - created_b).total_seconds())
# Calculate similarity based on time difference
# Incidents within 1 hour: high similarity
# Incidents within 24 hours: medium similarity
# Incidents within 7 days: low similarity
if time_diff <= 3600: # 1 hour
return 1.0
elif time_diff <= 86400: # 24 hours
return 0.7
elif time_diff <= 604800: # 7 days
return 0.3
else:
return 0.0
def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate service/component similarity"""
# Extract service/component information from text
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
# Common service/component keywords
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
if not services_a or not services_b:
return 0.0
intersection = len(services_a.intersection(services_b))
union = len(services_a.union(services_b))
return intersection / union if union > 0 else 0.0
def _calculate_category_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate category similarity"""
category_a = incident_a.get('category', '')
category_b = incident_b.get('category', '')
if not category_a or not category_b:
return 0.0
if category_a == category_b:
return 1.0
# Check for related categories
related_categories = {
'INFRASTRUCTURE': ['APPLICATION', 'SECURITY'],
'APPLICATION': ['INFRASTRUCTURE', 'USER_EXPERIENCE'],
'SECURITY': ['INFRASTRUCTURE', 'APPLICATION'],
'USER_EXPERIENCE': ['APPLICATION', 'DATA'],
'DATA': ['USER_EXPERIENCE', 'INTEGRATION'],
'INTEGRATION': ['DATA', 'APPLICATION']
}
if category_b in related_categories.get(category_a, []):
return 0.5
return 0.0
def _determine_correlation_type(self, incident_a: Dict, incident_b: Dict,
text_similarity: float, temporal_similarity: float,
service_similarity: float) -> str:
"""Determine the type of correlation between incidents"""
# Same service correlation
if service_similarity > 0.7:
return 'SAME_SERVICE'
# Same component correlation
if text_similarity > 0.6 and service_similarity > 0.4:
return 'SAME_COMPONENT'
# Temporal correlation
if temporal_similarity > 0.7 and text_similarity > 0.3:
return 'TEMPORAL'
# Pattern match
if text_similarity > 0.5:
return 'PATTERN'
# Dependency correlation
if service_similarity > 0.4 and temporal_similarity > 0.5:
return 'DEPENDENCY'
# Cascade effect
if temporal_similarity > 0.8 and text_similarity > 0.4:
return 'CASCADE'
return 'PATTERN' # Default
def _calculate_confidence_score(self, overall_similarity: float, correlation_type: str,
incident_a: Dict, incident_b: Dict) -> float:
"""Calculate confidence score for the correlation"""
base_confidence = overall_similarity
# Adjust based on correlation type
type_adjustments = {
'SAME_SERVICE': 0.1,
'SAME_COMPONENT': 0.15,
'TEMPORAL': 0.05,
'PATTERN': 0.0,
'DEPENDENCY': 0.1,
'CASCADE': 0.2
}
base_confidence += type_adjustments.get(correlation_type, 0.0)
# Adjust based on incident characteristics
if incident_a.get('severity') == incident_b.get('severity'):
base_confidence += 0.05
if incident_a.get('status') == incident_b.get('status'):
base_confidence += 0.03
return min(base_confidence, 1.0)
def _determine_correlation_strength(self, confidence_score: float) -> str:
"""Determine correlation strength based on confidence score"""
if confidence_score >= self.correlation_thresholds['VERY_STRONG']:
return 'VERY_STRONG'
elif confidence_score >= self.correlation_thresholds['STRONG']:
return 'STRONG'
elif confidence_score >= self.correlation_thresholds['MODERATE']:
return 'MODERATE'
else:
return 'WEAK'
def _extract_shared_keywords(self, incident_a: Dict, incident_b: Dict) -> List[str]:
    """Return up to 10 keywords appearing in both incidents' title/description."""

    def keyword_set(incident: Dict) -> set:
        text = f"{incident.get('title', '')} {incident.get('description', '')}".lower()
        return set(self.classifier._extract_keywords(text))

    # Cap at 10 to keep downstream payloads small.
    return list(keyword_set(incident_a) & keyword_set(incident_b))[:10]
def _calculate_time_difference(self, incident_a: Dict, incident_b: Dict) -> timedelta:
"""Calculate time difference between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return timedelta(0)
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
return abs(created_a - created_b)
def _detect_problem_patterns(self, incident_a: Dict, incident_b: Dict,
correlation_type: str, confidence_score: float) -> Tuple[bool, Optional[str]]:
"""Detect if correlation indicates a larger problem"""
# High confidence correlations are more likely to indicate problems
if confidence_score < 0.6:
return False, None
# Check for specific problem patterns
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
combined_text = f"{text_a} {text_b}"
for pattern_name, pattern_data in self.problem_patterns.items():
# Check for pattern keywords
keyword_matches = sum(1 for keyword in pattern_data['keywords'] if keyword in combined_text)
if keyword_matches >= 2: # At least 2 keywords match
return True, f"Potential {pattern_name.replace('_', ' ').lower()} detected"
# Check for cascade effects
if correlation_type == 'CASCADE' and confidence_score > 0.7:
return True, "Potential cascade failure detected"
# Check for recurring issues
if correlation_type == 'SAME_SERVICE' and confidence_score > 0.8:
return True, "Potential recurring service issue detected"
return False, None
def find_related_incidents(self, target_incident: Dict, all_incidents: List[Dict],
limit: int = 10) -> List[Tuple[Dict, CorrelationResult]]:
"""Find incidents related to a target incident"""
correlations = []
for incident in all_incidents:
if incident['id'] == target_incident['id']:
continue
correlation = self.correlate_incidents(target_incident, incident)
if correlation:
correlations.append((incident, correlation))
# Sort by confidence score and return top results
correlations.sort(key=lambda x: x[1].confidence_score, reverse=True)
return correlations[:limit]
def detect_problem_clusters(self, incidents: List[Dict],
min_incidents: int = 3,
time_window: timedelta = timedelta(hours=24)) -> List[Dict]:
"""Detect clusters of related incidents that might indicate larger problems"""
clusters = []
processed_incidents = set()
for incident in incidents:
if incident['id'] in processed_incidents:
continue
# Find related incidents within time window
related_incidents = []
incident_time = incident.get('created_at')
if isinstance(incident_time, str):
incident_time = datetime.fromisoformat(incident_time.replace('Z', '+00:00'))
for other_incident in incidents:
if other_incident['id'] == incident['id'] or other_incident['id'] in processed_incidents:
continue
other_time = other_incident.get('created_at')
if isinstance(other_time, str):
other_time = datetime.fromisoformat(other_time.replace('Z', '+00:00'))
# Check if within time window
if abs((incident_time - other_time).total_seconds()) <= time_window.total_seconds():
correlation = self.correlate_incidents(incident, other_incident)
if correlation and correlation.confidence_score > 0.5:
related_incidents.append((other_incident, correlation))
# If we found enough related incidents, create a cluster
if len(related_incidents) >= min_incidents - 1: # -1 because we include the original incident
cluster = {
'incidents': [incident] + [inc[0] for inc in related_incidents],
'correlations': [inc[1] for inc in related_incidents],
'problem_type': self._classify_problem_type(incident, related_incidents),
'confidence': sum(inc[1].confidence_score for inc in related_incidents) / len(related_incidents),
'time_span': self._calculate_cluster_time_span([incident] + [inc[0] for inc in related_incidents])
}
clusters.append(cluster)
# Mark incidents as processed
processed_incidents.add(incident['id'])
for related_incident, _ in related_incidents:
processed_incidents.add(related_incident['id'])
return clusters
def _classify_problem_type(self, incident: Dict, related_incidents: List[Tuple[Dict, CorrelationResult]]) -> str:
"""Classify the type of problem based on incident cluster"""
correlation_types = [corr.correlation_type for _, corr in related_incidents]
if 'CASCADE' in correlation_types:
return 'CASCADE_FAILURE'
elif 'SAME_SERVICE' in correlation_types:
return 'SERVICE_OUTAGE'
elif 'TEMPORAL' in correlation_types:
return 'RECURRING_ISSUE'
else:
return 'PATTERN_BASED_PROBLEM'
def _calculate_cluster_time_span(self, incidents: List[Dict]) -> timedelta:
"""Calculate the time span of a cluster of incidents"""
times = []
for incident in incidents:
created_at = incident.get('created_at')
if isinstance(created_at, str):
created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
times.append(created_at)
if len(times) < 2:
return timedelta(0)
return max(times) - min(times)