"""
Correlation engine for linking related incidents and problem detection.
"""
|
|
import re
import time
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional

from django.utils import timezone

from .classification import IncidentClassifier
|
|
|
|
|
|
@dataclass
class CorrelationResult:
    """Result of incident correlation analysis"""

    # Kind of relationship detected (e.g. 'SAME_SERVICE', 'TEMPORAL', 'CASCADE').
    correlation_type: str
    # Confidence in the correlation, in [0, 1].
    confidence_score: float
    # Bucketed label derived from the confidence score ('WEAK' .. 'VERY_STRONG').
    correlation_strength: str
    # Keywords appearing in both incidents' text (at most 10).
    shared_keywords: List[str]
    # Absolute gap between the two incidents' creation times.
    time_difference: timedelta
    # Overall weighted similarity score, in [0, 1].
    similarity_score: float
    # Whether this correlation looks like evidence of a wider problem.
    is_problem_indicator: bool
    # Human-readable description of the suspected problem, if any.
    problem_description: Optional[str]
|
|
|
|
|
|
class IncidentCorrelationEngine:
    """
    AI-driven correlation engine for linking related incidents
    """
|
|
|
|
def __init__(self):
    """Initialize model metadata, the text classifier, strength thresholds,
    and the problem-detection pattern catalog."""
    self.model_version = "v1.0"
    self.classifier = IncidentClassifier()

    # Correlation thresholds
    # Minimum confidence score required for each strength label.
    self.correlation_thresholds = {
        'VERY_STRONG': 0.9,
        'STRONG': 0.7,
        'MODERATE': 0.5,
        'WEAK': 0.3
    }

    # Problem detection patterns
    # Each entry pairs trigger keywords with a time window and a minimum
    # incident count that would suggest a systemic problem.
    # NOTE(review): _detect_problem_patterns only consults 'keywords';
    # 'time_window' and 'min_incidents' appear intended for cluster
    # detection — confirm before relying on them.
    self.problem_patterns = {
        'CASCADE_FAILURE': {
            'keywords': ['cascade', 'chain', 'reaction', 'domino', 'ripple', 'effect'],
            'time_window': timedelta(hours=2),
            'min_incidents': 3
        },
        'RECURRING_ISSUE': {
            'keywords': ['same', 'again', 'recurring', 'repeated', 'similar', 'identical'],
            'time_window': timedelta(days=7),
            'min_incidents': 2
        },
        'SERVICE_DEPENDENCY': {
            'keywords': ['dependency', 'dependent', 'downstream', 'upstream', 'service', 'api'],
            'time_window': timedelta(hours=1),
            'min_incidents': 2
        },
        'INFRASTRUCTURE_PATTERN': {
            'keywords': ['server', 'database', 'network', 'storage', 'infrastructure'],
            'time_window': timedelta(hours=4),
            'min_incidents': 3
        }
    }
|
|
|
|
def correlate_incidents(self, incident_a: Dict, incident_b: Dict) -> Optional[CorrelationResult]:
    """Correlate two incidents; return a CorrelationResult, or None when unrelated.

    Blends text (40%), temporal (20%), service (20%) and category (20%)
    similarity into an overall score; pairs scoring below 0.3 are
    treated as uncorrelated.
    """
    text_sim = self._calculate_text_similarity(incident_a, incident_b)
    time_sim = self._calculate_temporal_similarity(incident_a, incident_b)
    svc_sim = self._calculate_service_similarity(incident_a, incident_b)
    cat_sim = self._calculate_category_similarity(incident_a, incident_b)

    # Weighted blend of the individual similarity signals.
    overall = (
        text_sim * 0.4 +
        time_sim * 0.2 +
        svc_sim * 0.2 +
        cat_sim * 0.2
    )

    # Below this floor the pair is considered uncorrelated.
    if overall < 0.3:
        return None

    corr_type = self._determine_correlation_type(
        incident_a, incident_b, text_sim, time_sim, svc_sim
    )
    confidence = self._calculate_confidence_score(
        overall, corr_type, incident_a, incident_b
    )
    strength = self._determine_correlation_strength(confidence)
    is_problem, problem_desc = self._detect_problem_patterns(
        incident_a, incident_b, corr_type, confidence
    )

    return CorrelationResult(
        correlation_type=corr_type,
        confidence_score=confidence,
        correlation_strength=strength,
        shared_keywords=self._extract_shared_keywords(incident_a, incident_b),
        time_difference=self._calculate_time_difference(incident_a, incident_b),
        similarity_score=overall,
        is_problem_indicator=is_problem,
        problem_description=problem_desc
    )
|
|
|
|
def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
|
|
"""Calculate text similarity between two incidents"""
|
|
# Combine text fields
|
|
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
|
|
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()
|
|
|
|
# Extract keywords
|
|
keywords_a = set(self.classifier._extract_keywords(text_a))
|
|
keywords_b = set(self.classifier._extract_keywords(text_b))
|
|
|
|
if not keywords_a or not keywords_b:
|
|
return 0.0
|
|
|
|
# Calculate Jaccard similarity
|
|
intersection = len(keywords_a.intersection(keywords_b))
|
|
union = len(keywords_a.union(keywords_b))
|
|
|
|
jaccard_similarity = intersection / union if union > 0 else 0.0
|
|
|
|
# Also check for exact phrase matches
|
|
phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)
|
|
|
|
# Combine similarities
|
|
return (jaccard_similarity * 0.7 + phrase_similarity * 0.3)
|
|
|
|
def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
|
|
"""Calculate similarity based on common phrases"""
|
|
# Extract 2-3 word phrases
|
|
phrases_a = set()
|
|
phrases_b = set()
|
|
|
|
words_a = text_a.split()
|
|
words_b = text_b.split()
|
|
|
|
# Extract 2-word phrases
|
|
for i in range(len(words_a) - 1):
|
|
phrases_a.add(f"{words_a[i]} {words_a[i+1]}")
|
|
|
|
for i in range(len(words_b) - 1):
|
|
phrases_b.add(f"{words_b[i]} {words_b[i+1]}")
|
|
|
|
# Extract 3-word phrases
|
|
for i in range(len(words_a) - 2):
|
|
phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")
|
|
|
|
for i in range(len(words_b) - 2):
|
|
phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")
|
|
|
|
if not phrases_a or not phrases_b:
|
|
return 0.0
|
|
|
|
intersection = len(phrases_a.intersection(phrases_b))
|
|
union = len(phrases_a.union(phrases_b))
|
|
|
|
return intersection / union if union > 0 else 0.0
|
|
|
|
def _calculate_temporal_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
|
|
"""Calculate temporal similarity between incidents"""
|
|
created_a = incident_a.get('created_at')
|
|
created_b = incident_b.get('created_at')
|
|
|
|
if not created_a or not created_b:
|
|
return 0.0
|
|
|
|
# Convert to datetime if needed
|
|
if isinstance(created_a, str):
|
|
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
|
|
if isinstance(created_b, str):
|
|
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
|
|
|
|
time_diff = abs((created_a - created_b).total_seconds())
|
|
|
|
# Calculate similarity based on time difference
|
|
# Incidents within 1 hour: high similarity
|
|
# Incidents within 24 hours: medium similarity
|
|
# Incidents within 7 days: low similarity
|
|
if time_diff <= 3600: # 1 hour
|
|
return 1.0
|
|
elif time_diff <= 86400: # 24 hours
|
|
return 0.7
|
|
elif time_diff <= 604800: # 7 days
|
|
return 0.3
|
|
else:
|
|
return 0.0
|
|
|
|
def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
|
|
"""Calculate service/component similarity"""
|
|
# Extract service/component information from text
|
|
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
|
|
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
|
|
|
|
# Common service/component keywords
|
|
service_keywords = [
|
|
'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
|
|
'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
|
|
'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth'
|
|
]
|
|
|
|
services_a = set()
|
|
services_b = set()
|
|
|
|
for keyword in service_keywords:
|
|
if keyword in text_a:
|
|
services_a.add(keyword)
|
|
if keyword in text_b:
|
|
services_b.add(keyword)
|
|
|
|
if not services_a or not services_b:
|
|
return 0.0
|
|
|
|
intersection = len(services_a.intersection(services_b))
|
|
union = len(services_a.union(services_b))
|
|
|
|
return intersection / union if union > 0 else 0.0
|
|
|
|
def _calculate_category_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
|
|
"""Calculate category similarity"""
|
|
category_a = incident_a.get('category', '')
|
|
category_b = incident_b.get('category', '')
|
|
|
|
if not category_a or not category_b:
|
|
return 0.0
|
|
|
|
if category_a == category_b:
|
|
return 1.0
|
|
|
|
# Check for related categories
|
|
related_categories = {
|
|
'INFRASTRUCTURE': ['APPLICATION', 'SECURITY'],
|
|
'APPLICATION': ['INFRASTRUCTURE', 'USER_EXPERIENCE'],
|
|
'SECURITY': ['INFRASTRUCTURE', 'APPLICATION'],
|
|
'USER_EXPERIENCE': ['APPLICATION', 'DATA'],
|
|
'DATA': ['USER_EXPERIENCE', 'INTEGRATION'],
|
|
'INTEGRATION': ['DATA', 'APPLICATION']
|
|
}
|
|
|
|
if category_b in related_categories.get(category_a, []):
|
|
return 0.5
|
|
|
|
return 0.0
|
|
|
|
def _determine_correlation_type(self, incident_a: Dict, incident_b: Dict,
|
|
text_similarity: float, temporal_similarity: float,
|
|
service_similarity: float) -> str:
|
|
"""Determine the type of correlation between incidents"""
|
|
|
|
# Same service correlation
|
|
if service_similarity > 0.7:
|
|
return 'SAME_SERVICE'
|
|
|
|
# Same component correlation
|
|
if text_similarity > 0.6 and service_similarity > 0.4:
|
|
return 'SAME_COMPONENT'
|
|
|
|
# Temporal correlation
|
|
if temporal_similarity > 0.7 and text_similarity > 0.3:
|
|
return 'TEMPORAL'
|
|
|
|
# Pattern match
|
|
if text_similarity > 0.5:
|
|
return 'PATTERN'
|
|
|
|
# Dependency correlation
|
|
if service_similarity > 0.4 and temporal_similarity > 0.5:
|
|
return 'DEPENDENCY'
|
|
|
|
# Cascade effect
|
|
if temporal_similarity > 0.8 and text_similarity > 0.4:
|
|
return 'CASCADE'
|
|
|
|
return 'PATTERN' # Default
|
|
|
|
def _calculate_confidence_score(self, overall_similarity: float, correlation_type: str,
|
|
incident_a: Dict, incident_b: Dict) -> float:
|
|
"""Calculate confidence score for the correlation"""
|
|
base_confidence = overall_similarity
|
|
|
|
# Adjust based on correlation type
|
|
type_adjustments = {
|
|
'SAME_SERVICE': 0.1,
|
|
'SAME_COMPONENT': 0.15,
|
|
'TEMPORAL': 0.05,
|
|
'PATTERN': 0.0,
|
|
'DEPENDENCY': 0.1,
|
|
'CASCADE': 0.2
|
|
}
|
|
|
|
base_confidence += type_adjustments.get(correlation_type, 0.0)
|
|
|
|
# Adjust based on incident characteristics
|
|
if incident_a.get('severity') == incident_b.get('severity'):
|
|
base_confidence += 0.05
|
|
|
|
if incident_a.get('status') == incident_b.get('status'):
|
|
base_confidence += 0.03
|
|
|
|
return min(base_confidence, 1.0)
|
|
|
|
def _determine_correlation_strength(self, confidence_score: float) -> str:
|
|
"""Determine correlation strength based on confidence score"""
|
|
if confidence_score >= self.correlation_thresholds['VERY_STRONG']:
|
|
return 'VERY_STRONG'
|
|
elif confidence_score >= self.correlation_thresholds['STRONG']:
|
|
return 'STRONG'
|
|
elif confidence_score >= self.correlation_thresholds['MODERATE']:
|
|
return 'MODERATE'
|
|
else:
|
|
return 'WEAK'
|
|
|
|
def _extract_shared_keywords(self, incident_a: Dict, incident_b: Dict) -> List[str]:
|
|
"""Extract keywords shared between incidents"""
|
|
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
|
|
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
|
|
|
|
keywords_a = set(self.classifier._extract_keywords(text_a))
|
|
keywords_b = set(self.classifier._extract_keywords(text_b))
|
|
|
|
shared = list(keywords_a.intersection(keywords_b))
|
|
return shared[:10] # Return top 10 shared keywords
|
|
|
|
def _calculate_time_difference(self, incident_a: Dict, incident_b: Dict) -> timedelta:
|
|
"""Calculate time difference between incidents"""
|
|
created_a = incident_a.get('created_at')
|
|
created_b = incident_b.get('created_at')
|
|
|
|
if not created_a or not created_b:
|
|
return timedelta(0)
|
|
|
|
# Convert to datetime if needed
|
|
if isinstance(created_a, str):
|
|
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
|
|
if isinstance(created_b, str):
|
|
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
|
|
|
|
return abs(created_a - created_b)
|
|
|
|
def _detect_problem_patterns(self, incident_a: Dict, incident_b: Dict,
|
|
correlation_type: str, confidence_score: float) -> Tuple[bool, Optional[str]]:
|
|
"""Detect if correlation indicates a larger problem"""
|
|
|
|
# High confidence correlations are more likely to indicate problems
|
|
if confidence_score < 0.6:
|
|
return False, None
|
|
|
|
# Check for specific problem patterns
|
|
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
|
|
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
|
|
combined_text = f"{text_a} {text_b}"
|
|
|
|
for pattern_name, pattern_data in self.problem_patterns.items():
|
|
# Check for pattern keywords
|
|
keyword_matches = sum(1 for keyword in pattern_data['keywords'] if keyword in combined_text)
|
|
|
|
if keyword_matches >= 2: # At least 2 keywords match
|
|
return True, f"Potential {pattern_name.replace('_', ' ').lower()} detected"
|
|
|
|
# Check for cascade effects
|
|
if correlation_type == 'CASCADE' and confidence_score > 0.7:
|
|
return True, "Potential cascade failure detected"
|
|
|
|
# Check for recurring issues
|
|
if correlation_type == 'SAME_SERVICE' and confidence_score > 0.8:
|
|
return True, "Potential recurring service issue detected"
|
|
|
|
return False, None
|
|
|
|
def find_related_incidents(self, target_incident: Dict, all_incidents: List[Dict],
                           limit: int = 10) -> List[Tuple[Dict, CorrelationResult]]:
    """Return up to *limit* incidents correlated with the target,
    strongest (highest confidence) first."""
    matches = [
        (candidate, result)
        for candidate in all_incidents
        if candidate['id'] != target_incident['id']
        and (result := self.correlate_incidents(target_incident, candidate)) is not None
    ]

    ranked = sorted(matches, key=lambda pair: pair[1].confidence_score, reverse=True)
    return ranked[:limit]
|
|
|
|
def detect_problem_clusters(self, incidents: List[Dict],
                            min_incidents: int = 3,
                            time_window: timedelta = timedelta(hours=24)) -> List[Dict]:
    """Detect clusters of related incidents that might indicate larger problems.

    For each not-yet-processed incident, gathers other incidents created
    within *time_window* whose pairwise correlation confidence exceeds
    0.5. When at least *min_incidents* incidents (seed included) are
    found, emits a cluster dict with keys: 'incidents', 'correlations',
    'problem_type', 'confidence' (mean pairwise confidence), and
    'time_span'. O(n^2) pairwise comparisons over *incidents*.

    NOTE(review): clusters are greedy and order-dependent — members of
    an emitted cluster are excluded from later seeds.
    """
    clusters = []
    processed_incidents = set()

    for incident in incidents:
        if incident['id'] in processed_incidents:
            continue

        # Find related incidents within time window
        related_incidents = []
        incident_time = incident.get('created_at')

        # ISO-8601 with trailing 'Z' normalized to an explicit UTC offset.
        if isinstance(incident_time, str):
            incident_time = datetime.fromisoformat(incident_time.replace('Z', '+00:00'))

        for other_incident in incidents:
            # Skip self-comparison and incidents already claimed by a cluster.
            if other_incident['id'] == incident['id'] or other_incident['id'] in processed_incidents:
                continue

            other_time = other_incident.get('created_at')
            if isinstance(other_time, str):
                other_time = datetime.fromisoformat(other_time.replace('Z', '+00:00'))

            # Check if within time window
            if abs((incident_time - other_time).total_seconds()) <= time_window.total_seconds():
                correlation = self.correlate_incidents(incident, other_incident)
                # Only confident correlations (> 0.5) count toward a cluster.
                if correlation and correlation.confidence_score > 0.5:
                    related_incidents.append((other_incident, correlation))

        # If we found enough related incidents, create a cluster
        if len(related_incidents) >= min_incidents - 1:  # -1 because we include the original incident
            cluster = {
                'incidents': [incident] + [inc[0] for inc in related_incidents],
                'correlations': [inc[1] for inc in related_incidents],
                'problem_type': self._classify_problem_type(incident, related_incidents),
                # Mean confidence across the seed's pairwise correlations.
                'confidence': sum(inc[1].confidence_score for inc in related_incidents) / len(related_incidents),
                'time_span': self._calculate_cluster_time_span([incident] + [inc[0] for inc in related_incidents])
            }
            clusters.append(cluster)

        # Mark incidents as processed
        # NOTE(review): marking happens whether or not a cluster was emitted,
        # so near-miss groups are never revisited — confirm this is intended.
        processed_incidents.add(incident['id'])
        for related_incident, _ in related_incidents:
            processed_incidents.add(related_incident['id'])

    return clusters
|
|
|
|
def _classify_problem_type(self, incident: Dict, related_incidents: List[Tuple[Dict, CorrelationResult]]) -> str:
    """Label the cluster's problem type from its correlation types."""
    # Priority order: cascade > same-service > temporal > fallback.
    seen = {corr.correlation_type for _, corr in related_incidents}

    if 'CASCADE' in seen:
        return 'CASCADE_FAILURE'
    if 'SAME_SERVICE' in seen:
        return 'SERVICE_OUTAGE'
    if 'TEMPORAL' in seen:
        return 'RECURRING_ISSUE'
    return 'PATTERN_BASED_PROBLEM'
|
|
|
|
def _calculate_cluster_time_span(self, incidents: List[Dict]) -> timedelta:
|
|
"""Calculate the time span of a cluster of incidents"""
|
|
times = []
|
|
for incident in incidents:
|
|
created_at = incident.get('created_at')
|
|
if isinstance(created_at, str):
|
|
created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
|
|
times.append(created_at)
|
|
|
|
if len(times) < 2:
|
|
return timedelta(0)
|
|
|
|
return max(times) - min(times)
|