"""
Correlation engine for linking related incidents and problem detection
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from django.utils import timezone
from .classification import IncidentClassifier


@dataclass
class CorrelationResult:
    """Result of incident correlation analysis"""
    correlation_type: str               # e.g. SAME_SERVICE, TEMPORAL, CASCADE, PATTERN
    confidence_score: float             # 0.0-1.0 confidence in the correlation
    correlation_strength: str           # VERY_STRONG / STRONG / MODERATE / WEAK
    shared_keywords: List[str]          # keywords common to both incidents (max 10)
    time_difference: timedelta          # absolute creation-time gap between incidents
    similarity_score: float             # weighted overall similarity (0.0-1.0)
    is_problem_indicator: bool          # True when the pair suggests a larger problem
    problem_description: Optional[str]  # human-readable problem summary, if any


class IncidentCorrelationEngine:
    """
    AI-driven correlation engine for linking related incidents.

    Incidents are plain dicts; the engine reads the keys 'id', 'title',
    'description', 'free_text', 'category', 'severity', 'status' and
    'created_at' (a datetime or an ISO-8601 string, 'Z' suffix allowed).
    """

    def __init__(self):
        self.model_version = "v1.0"
        self.classifier = IncidentClassifier()

        # Minimum confidence score required for each strength label.
        self.correlation_thresholds = {
            'VERY_STRONG': 0.9,
            'STRONG': 0.7,
            'MODERATE': 0.5,
            'WEAK': 0.3,
        }

        # Heuristic patterns hinting at a larger underlying problem.
        # time_window / min_incidents describe each problem class; only the
        # keyword lists are consulted by _detect_problem_patterns below.
        self.problem_patterns = {
            'CASCADE_FAILURE': {
                'keywords': ['cascade', 'chain', 'reaction', 'domino', 'ripple', 'effect'],
                'time_window': timedelta(hours=2),
                'min_incidents': 3,
            },
            'RECURRING_ISSUE': {
                'keywords': ['same', 'again', 'recurring', 'repeated', 'similar', 'identical'],
                'time_window': timedelta(days=7),
                'min_incidents': 2,
            },
            'SERVICE_DEPENDENCY': {
                'keywords': ['dependency', 'dependent', 'downstream', 'upstream', 'service', 'api'],
                'time_window': timedelta(hours=1),
                'min_incidents': 2,
            },
            'INFRASTRUCTURE_PATTERN': {
                'keywords': ['server', 'database', 'network', 'storage', 'infrastructure'],
                'time_window': timedelta(hours=4),
                'min_incidents': 3,
            },
        }

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _to_datetime(value) -> Optional[datetime]:
        """
        Normalize a 'created_at' value to a datetime.

        ISO-8601 strings are parsed (a trailing 'Z' is treated as '+00:00');
        datetime instances and None pass through unchanged.
        NOTE(review): subtracting a naive from an aware datetime raises —
        callers are presumed to supply consistent values; confirm upstream.
        """
        if isinstance(value, str):
            return datetime.fromisoformat(value.replace('Z', '+00:00'))
        return value

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def correlate_incidents(self, incident_a: Dict, incident_b: Dict) -> Optional[CorrelationResult]:
        """
        Correlate two incidents and determine if they are related.

        Returns a CorrelationResult, or None when the weighted similarity of
        the pair falls below the 0.3 correlation floor.
        """
        # Calculate the individual similarity signals.
        text_similarity = self._calculate_text_similarity(incident_a, incident_b)
        temporal_similarity = self._calculate_temporal_similarity(incident_a, incident_b)
        service_similarity = self._calculate_service_similarity(incident_a, incident_b)
        category_similarity = self._calculate_category_similarity(incident_a, incident_b)

        # Text carries the largest weight; the other signals share the rest.
        overall_similarity = (
            text_similarity * 0.4 +
            temporal_similarity * 0.2 +
            service_similarity * 0.2 +
            category_similarity * 0.2
        )

        # Below this floor the incidents are considered unrelated.
        if overall_similarity < 0.3:
            return None

        correlation_type = self._determine_correlation_type(
            incident_a, incident_b, text_similarity, temporal_similarity, service_similarity
        )
        confidence_score = self._calculate_confidence_score(
            overall_similarity, correlation_type, incident_a, incident_b
        )
        correlation_strength = self._determine_correlation_strength(confidence_score)
        shared_keywords = self._extract_shared_keywords(incident_a, incident_b)
        time_diff = self._calculate_time_difference(incident_a, incident_b)
        is_problem_indicator, problem_description = self._detect_problem_patterns(
            incident_a, incident_b, correlation_type, confidence_score
        )

        return CorrelationResult(
            correlation_type=correlation_type,
            confidence_score=confidence_score,
            correlation_strength=correlation_strength,
            shared_keywords=shared_keywords,
            time_difference=time_diff,
            similarity_score=overall_similarity,
            is_problem_indicator=is_problem_indicator,
            problem_description=problem_description,
        )

    def find_related_incidents(self, target_incident: Dict, all_incidents: List[Dict],
                               limit: int = 10) -> List[Tuple[Dict, CorrelationResult]]:
        """
        Find incidents related to a target incident.

        Returns up to `limit` (incident, CorrelationResult) pairs, sorted by
        descending confidence score. The target itself is excluded by id.
        """
        correlations = []
        for incident in all_incidents:
            if incident['id'] == target_incident['id']:
                continue
            correlation = self.correlate_incidents(target_incident, incident)
            if correlation:
                correlations.append((incident, correlation))

        correlations.sort(key=lambda pair: pair[1].confidence_score, reverse=True)
        return correlations[:limit]

    def detect_problem_clusters(self, incidents: List[Dict], min_incidents: int = 3,
                                time_window: timedelta = timedelta(hours=24)) -> List[Dict]:
        """
        Detect clusters of related incidents that might indicate larger problems.

        Greedy single-pass clustering: for each unprocessed incident, gather
        other incidents within `time_window` whose correlation confidence
        exceeds 0.5; a cluster is emitted when at least `min_incidents`
        incidents (seed included) are involved.

        Returns a list of dicts with keys 'incidents', 'correlations',
        'problem_type', 'confidence' (mean pairwise confidence) and 'time_span'.
        """
        clusters = []
        processed_incidents = set()
        window_seconds = time_window.total_seconds()

        for incident in incidents:
            if incident['id'] in processed_incidents:
                continue

            incident_time = self._to_datetime(incident.get('created_at'))
            # BUGFIX: an incident without a timestamp previously crashed the
            # time-window arithmetic (None - datetime); skip it instead.
            if incident_time is None:
                continue

            # Find related incidents within the time window.
            related_incidents = []
            for other_incident in incidents:
                if (other_incident['id'] == incident['id']
                        or other_incident['id'] in processed_incidents):
                    continue

                other_time = self._to_datetime(other_incident.get('created_at'))
                if other_time is None:
                    continue  # see BUGFIX note above

                if abs((incident_time - other_time).total_seconds()) <= window_seconds:
                    correlation = self.correlate_incidents(incident, other_incident)
                    if correlation and correlation.confidence_score > 0.5:
                        related_incidents.append((other_incident, correlation))

            # -1 because the cluster includes the seed incident itself.
            if len(related_incidents) >= min_incidents - 1:
                members = [incident] + [inc for inc, _ in related_incidents]
                clusters.append({
                    'incidents': members,
                    'correlations': [corr for _, corr in related_incidents],
                    'problem_type': self._classify_problem_type(incident, related_incidents),
                    'confidence': sum(corr.confidence_score
                                      for _, corr in related_incidents) / len(related_incidents),
                    'time_span': self._calculate_cluster_time_span(members),
                })

                # Mark clustered incidents as processed so each incident
                # belongs to at most one cluster.
                processed_incidents.add(incident['id'])
                for related_incident, _ in related_incidents:
                    processed_incidents.add(related_incident['id'])

        return clusters

    # ------------------------------------------------------------------
    # Similarity metrics
    # ------------------------------------------------------------------

    def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """
        Calculate text similarity between two incidents (0.0-1.0).

        Combines keyword Jaccard similarity (weight 0.7) with 2/3-word
        phrase overlap (weight 0.3) over title + description + free_text.
        """
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()

        keywords_a = set(self.classifier._extract_keywords(text_a))
        keywords_b = set(self.classifier._extract_keywords(text_b))

        if not keywords_a or not keywords_b:
            return 0.0

        # Jaccard similarity of the keyword sets.
        union = len(keywords_a | keywords_b)
        jaccard_similarity = len(keywords_a & keywords_b) / union if union > 0 else 0.0

        # Also reward exact multi-word phrase matches.
        phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)

        return jaccard_similarity * 0.7 + phrase_similarity * 0.3

    def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
        """Jaccard similarity over the sets of contiguous 2- and 3-word phrases."""

        def ngrams(words: List[str]) -> set:
            # All contiguous 2-word and 3-word phrases of the word list.
            grams = {f"{words[i]} {words[i + 1]}" for i in range(len(words) - 1)}
            grams.update(
                f"{words[i]} {words[i + 1]} {words[i + 2]}" for i in range(len(words) - 2)
            )
            return grams

        phrases_a = ngrams(text_a.split())
        phrases_b = ngrams(text_b.split())

        if not phrases_a or not phrases_b:
            return 0.0

        union = len(phrases_a | phrases_b)
        return len(phrases_a & phrases_b) / union if union > 0 else 0.0

    def _calculate_temporal_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """
        Step-function similarity by creation-time proximity:
        <= 1 hour -> 1.0, <= 24 hours -> 0.7, <= 7 days -> 0.3, else 0.0.
        Missing timestamps yield 0.0.
        """
        created_a = self._to_datetime(incident_a.get('created_at'))
        created_b = self._to_datetime(incident_b.get('created_at'))

        if not created_a or not created_b:
            return 0.0

        time_diff = abs((created_a - created_b).total_seconds())

        if time_diff <= 3600:        # 1 hour
            return 1.0
        elif time_diff <= 86400:     # 24 hours
            return 0.7
        elif time_diff <= 604800:    # 7 days
            return 0.3
        return 0.0

    def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """
        Jaccard similarity over service/component keywords found as
        substrings of each incident's title + description.
        """
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()

        # Common service/component vocabulary.
        service_keywords = [
            'api', 'service', 'database', 'server', 'application', 'website',
            'mobile', 'frontend', 'backend', 'microservice', 'gateway',
            'load balancer', 'cache', 'queue', 'message', 'notification',
            'email', 'sms', 'payment', 'auth'
        ]

        # NOTE: plain substring matching, so e.g. 'sms' also matches inside
        # longer words — inherited behavior, kept as-is.
        services_a = {kw for kw in service_keywords if kw in text_a}
        services_b = {kw for kw in service_keywords if kw in text_b}

        if not services_a or not services_b:
            return 0.0

        union = len(services_a | services_b)
        return len(services_a & services_b) / union if union > 0 else 0.0

    def _calculate_category_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """
        Category similarity: 1.0 for an exact match, 0.5 for related
        categories, otherwise 0.0 (also when either category is missing).
        """
        category_a = incident_a.get('category', '')
        category_b = incident_b.get('category', '')

        if not category_a or not category_b:
            return 0.0
        if category_a == category_b:
            return 1.0

        # Categories considered "related" to each other.
        related_categories = {
            'INFRASTRUCTURE': ['APPLICATION', 'SECURITY'],
            'APPLICATION': ['INFRASTRUCTURE', 'USER_EXPERIENCE'],
            'SECURITY': ['INFRASTRUCTURE', 'APPLICATION'],
            'USER_EXPERIENCE': ['APPLICATION', 'DATA'],
            'DATA': ['USER_EXPERIENCE', 'INTEGRATION'],
            'INTEGRATION': ['DATA', 'APPLICATION'],
        }

        if category_b in related_categories.get(category_a, []):
            return 0.5
        return 0.0

    # ------------------------------------------------------------------
    # Classification of the correlation
    # ------------------------------------------------------------------

    def _determine_correlation_type(self, incident_a: Dict, incident_b: Dict,
                                    text_similarity: float, temporal_similarity: float,
                                    service_similarity: float) -> str:
        """
        Determine the type of correlation between incidents.

        Checks run most-specific first. BUGFIX: the CASCADE check now
        precedes the TEMPORAL check — previously TEMPORAL
        (temporal > 0.7 and text > 0.3) always matched first, which made
        the CASCADE branch (temporal > 0.8 and text > 0.4) unreachable.
        """
        # Same service correlation
        if service_similarity > 0.7:
            return 'SAME_SERVICE'

        # Same component correlation
        if text_similarity > 0.6 and service_similarity > 0.4:
            return 'SAME_COMPONENT'

        # Cascade effect (must be tested before the broader TEMPORAL rule)
        if temporal_similarity > 0.8 and text_similarity > 0.4:
            return 'CASCADE'

        # Temporal correlation
        if temporal_similarity > 0.7 and text_similarity > 0.3:
            return 'TEMPORAL'

        # Pattern match
        if text_similarity > 0.5:
            return 'PATTERN'

        # Dependency correlation
        if service_similarity > 0.4 and temporal_similarity > 0.5:
            return 'DEPENDENCY'

        return 'PATTERN'  # Default

    def _calculate_confidence_score(self, overall_similarity: float, correlation_type: str,
                                    incident_a: Dict, incident_b: Dict) -> float:
        """
        Confidence = overall similarity, bumped by correlation type and by
        matching severity/status, capped at 1.0.
        """
        base_confidence = overall_similarity

        # Some correlation types are inherently more trustworthy.
        type_adjustments = {
            'SAME_SERVICE': 0.1,
            'SAME_COMPONENT': 0.15,
            'TEMPORAL': 0.05,
            'PATTERN': 0.0,
            'DEPENDENCY': 0.1,
            'CASCADE': 0.2,
        }
        base_confidence += type_adjustments.get(correlation_type, 0.0)

        # Matching incident characteristics add small bumps.
        if incident_a.get('severity') == incident_b.get('severity'):
            base_confidence += 0.05
        if incident_a.get('status') == incident_b.get('status'):
            base_confidence += 0.03

        return min(base_confidence, 1.0)

    def _determine_correlation_strength(self, confidence_score: float) -> str:
        """Map a confidence score to a strength label via the configured thresholds."""
        if confidence_score >= self.correlation_thresholds['VERY_STRONG']:
            return 'VERY_STRONG'
        elif confidence_score >= self.correlation_thresholds['STRONG']:
            return 'STRONG'
        elif confidence_score >= self.correlation_thresholds['MODERATE']:
            return 'MODERATE'
        else:
            return 'WEAK'

    def _extract_shared_keywords(self, incident_a: Dict, incident_b: Dict) -> List[str]:
        """Return up to 10 keywords shared between the two incidents' texts."""
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()

        keywords_a = set(self.classifier._extract_keywords(text_a))
        keywords_b = set(self.classifier._extract_keywords(text_b))

        shared = list(keywords_a & keywords_b)
        return shared[:10]  # Return top 10 shared keywords

    def _calculate_time_difference(self, incident_a: Dict, incident_b: Dict) -> timedelta:
        """Absolute creation-time difference; timedelta(0) if either is missing."""
        created_a = self._to_datetime(incident_a.get('created_at'))
        created_b = self._to_datetime(incident_b.get('created_at'))

        if not created_a or not created_b:
            return timedelta(0)

        return abs(created_a - created_b)

    # ------------------------------------------------------------------
    # Problem detection
    # ------------------------------------------------------------------

    def _detect_problem_patterns(self, incident_a: Dict, incident_b: Dict,
                                 correlation_type: str,
                                 confidence_score: float) -> Tuple[bool, Optional[str]]:
        """
        Detect if a correlation indicates a larger problem.

        Returns (is_problem, description). Only correlations with
        confidence >= 0.6 are considered problem candidates.
        """
        # Low-confidence correlations are not treated as problem indicators.
        if confidence_score < 0.6:
            return False, None

        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
        combined_text = f"{text_a} {text_b}"

        # Keyword-based pattern matching (substring matches; at least 2 hits).
        for pattern_name, pattern_data in self.problem_patterns.items():
            keyword_matches = sum(1 for keyword in pattern_data['keywords']
                                  if keyword in combined_text)
            if keyword_matches >= 2:
                return True, f"Potential {pattern_name.replace('_', ' ').lower()} detected"

        # High-confidence cascade correlations indicate cascade failures.
        if correlation_type == 'CASCADE' and confidence_score > 0.7:
            return True, "Potential cascade failure detected"

        # Very confident same-service correlations suggest a recurring issue.
        if correlation_type == 'SAME_SERVICE' and confidence_score > 0.8:
            return True, "Potential recurring service issue detected"

        return False, None

    def _classify_problem_type(self, incident: Dict,
                               related_incidents: List[Tuple[Dict, CorrelationResult]]) -> str:
        """Classify a cluster's problem type from its correlation types (priority order)."""
        correlation_types = [corr.correlation_type for _, corr in related_incidents]

        if 'CASCADE' in correlation_types:
            return 'CASCADE_FAILURE'
        elif 'SAME_SERVICE' in correlation_types:
            return 'SERVICE_OUTAGE'
        elif 'TEMPORAL' in correlation_types:
            return 'RECURRING_ISSUE'
        else:
            return 'PATTERN_BASED_PROBLEM'

    def _calculate_cluster_time_span(self, incidents: List[Dict]) -> timedelta:
        """
        Time span (max - min creation time) of a cluster of incidents.

        BUGFIX: incidents without a usable created_at are ignored rather
        than crashing the max/min comparison; fewer than two timestamps
        yield timedelta(0).
        """
        times = [
            t for t in (self._to_datetime(i.get('created_at')) for i in incidents)
            if t is not None
        ]

        if len(times) < 2:
            return timedelta(0)

        return max(times) - min(times)