Commit 6b247e5b9f (parent 306b20e24a)
Author: Iliyan Angelov
Date: 2025-09-19 11:58:53 +03:00
11423 changed files with 1500615 additions and 778 deletions

View File

@@ -0,0 +1 @@
# AI components for incident intelligence

View File

@@ -0,0 +1,471 @@
"""
AI-driven incident classification using NLP techniques
"""
import re
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from django.conf import settings
@dataclass
class ClassificationResult:
"""Result of incident classification"""
category: str
subcategory: str
confidence: float
alternative_categories: List[Dict[str, float]]
keywords: List[str]
sentiment_score: float
urgency_indicators: List[str]
class IncidentClassifier:
"""
AI-driven incident classifier using rule-based keyword matching and lightweight heuristics
"""
def __init__(self):
self.model_version = "v1.0"
# Predefined categories and their keywords
self.categories = {
'INFRASTRUCTURE': {
'keywords': ['server', 'database', 'network', 'storage', 'disk', 'memory', 'cpu', 'load', 'bandwidth', 'connection', 'timeout', 'latency'],
'subcategories': {
'SERVER_ISSUE': ['server', 'host', 'machine', 'instance', 'vm', 'container'],
'DATABASE_ISSUE': ['database', 'db', 'sql', 'query', 'connection', 'timeout', 'deadlock'],
'NETWORK_ISSUE': ['network', 'connectivity', 'dns', 'firewall', 'routing', 'packet', 'bandwidth'],
'STORAGE_ISSUE': ['storage', 'disk', 'volume', 'space', 'capacity', 'i/o', 'read', 'write'],
}
},
'APPLICATION': {
'keywords': ['application', 'app', 'service', 'api', 'endpoint', 'response', 'error', 'exception', 'crash', 'bug'],
'subcategories': {
'API_ISSUE': ['api', 'endpoint', 'response', 'status', 'code', 'timeout', 'rate', 'limit'],
'SERVICE_ISSUE': ['service', 'microservice', 'dependency', 'circuit', 'breaker', 'fallback'],
'PERFORMANCE_ISSUE': ['performance', 'slow', 'latency', 'response', 'time', 'throughput', 'bottleneck'],
'FUNCTIONALITY_ISSUE': ['bug', 'feature', 'functionality', 'behavior', 'unexpected', 'incorrect'],
}
},
'SECURITY': {
'keywords': ['security', 'authentication', 'authorization', 'access', 'permission', 'breach', 'attack', 'vulnerability', 'malware'],
'subcategories': {
'AUTH_ISSUE': ['authentication', 'login', 'password', 'token', 'session', 'credential'],
'ACCESS_ISSUE': ['authorization', 'permission', 'access', 'denied', 'forbidden', 'unauthorized'],
'THREAT_ISSUE': ['attack', 'breach', 'malware', 'virus', 'intrusion', 'suspicious', 'anomaly'],
'VULNERABILITY': ['vulnerability', 'exploit', 'patch', 'update', 'security', 'fix'],
}
},
'USER_EXPERIENCE': {
'keywords': ['user', 'interface', 'ui', 'ux', 'experience', 'usability', 'navigation', 'button', 'form', 'page'],
'subcategories': {
'UI_ISSUE': ['interface', 'ui', 'button', 'form', 'page', 'layout', 'display', 'rendering'],
'NAVIGATION_ISSUE': ['navigation', 'menu', 'link', 'redirect', 'routing', 'page not found'],
'USABILITY_ISSUE': ['usability', 'experience', 'confusing', 'difficult', 'unclear', 'intuitive'],
'MOBILE_ISSUE': ['mobile', 'app', 'responsive', 'device', 'screen', 'touch', 'gesture'],
}
},
'DATA': {
'keywords': ['data', 'file', 'import', 'export', 'sync', 'backup', 'recovery', 'corruption', 'missing', 'duplicate'],
'subcategories': {
'DATA_CORRUPTION': ['corruption', 'corrupted', 'invalid', 'malformed', 'broken', 'damaged'],
'DATA_LOSS': ['missing', 'lost', 'deleted', 'removed', 'disappeared', 'not found'],
'SYNC_ISSUE': ['sync', 'synchronization', 'conflict', 'merge', 'update', 'latest'],
'BACKUP_ISSUE': ['backup', 'restore', 'recovery', 'archive', 'retention', 'storage'],
}
},
'INTEGRATION': {
'keywords': ['integration', 'third-party', 'external', 'webhook', 'api', 'connection', 'sync', 'import', 'export'],
'subcategories': {
'THIRD_PARTY_ISSUE': ['third-party', 'external', 'vendor', 'partner', 'service', 'provider'],
'WEBHOOK_ISSUE': ['webhook', 'callback', 'notification', 'event', 'trigger', 'delivery'],
'API_INTEGRATION': ['api', 'integration', 'endpoint', 'connection', 'authentication', 'response'],
'DATA_INTEGRATION': ['import', 'export', 'migration', 'transformation', 'mapping', 'format'],
}
}
}
# Urgency indicators
self.urgency_indicators = {
'CRITICAL': ['down', 'outage', 'critical', 'emergency', 'urgent', 'immediate', 'severe', 'complete', 'total'],
'HIGH': ['major', 'significant', 'important', 'priority', 'escalate', 'escalated', 'blocking'],
'MEDIUM': ['moderate', 'some', 'partial', 'intermittent', 'occasional', 'sometimes'],
'LOW': ['minor', 'small', 'cosmetic', 'enhancement', 'improvement', 'suggestion']
}
# Sentiment analysis keywords
self.sentiment_keywords = {
'positive': ['working', 'fixed', 'resolved', 'good', 'excellent', 'improved', 'better', 'success'],
'negative': ['broken', 'failed', 'error', 'issue', 'problem', 'bug', 'crash', 'down', 'slow', 'terrible', 'awful'],
'neutral': ['report', 'incident', 'ticket', 'request', 'update', 'status', 'information']
}
def classify_incident(self, title: str, description: str, free_text: str = "") -> ClassificationResult:
"""
Classify an incident based on its text content
"""
start_time = time.time()
# Combine all text for analysis
combined_text = f"{title} {description} {free_text}".lower()
# Extract keywords
keywords = self._extract_keywords(combined_text)
# Analyze sentiment
sentiment_score = self._analyze_sentiment(combined_text)
# Detect urgency indicators
urgency_indicators = self._detect_urgency_indicators(combined_text)
# Classify category and subcategory
category, subcategory, confidence, alternatives = self._classify_category(combined_text, keywords)
processing_time = time.time() - start_time
return ClassificationResult(
category=category,
subcategory=subcategory,
confidence=confidence,
alternative_categories=alternatives,
keywords=keywords,
sentiment_score=sentiment_score,
urgency_indicators=urgency_indicators
)
def _extract_keywords(self, text: str) -> List[str]:
"""Extract relevant keywords from text"""
# Simple keyword extraction - in production, use more sophisticated NLP
words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
# Filter out common stop words
stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', 'a', 'an'}
keywords = [word for word in words if word not in stop_words]
# Count frequency and return top keywords
from collections import Counter
keyword_counts = Counter(keywords)
return [word for word, count in keyword_counts.most_common(10)]
def _analyze_sentiment(self, text: str) -> float:
"""Analyze sentiment of the text (-1 to 1)"""
positive_count = sum(1 for word in self.sentiment_keywords['positive'] if word in text)
negative_count = sum(1 for word in self.sentiment_keywords['negative'] if word in text)
total_sentiment_words = positive_count + negative_count
if total_sentiment_words == 0:
return 0.0
return (positive_count - negative_count) / total_sentiment_words
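# Worked example (hypothetical text): for a report containing "error", "crash"
# and "resolved", positive_count = 1 and negative_count = 2, so the method
# returns (1 - 2) / 3 ≈ -0.33, i.e. mildly negative sentiment.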
def _detect_urgency_indicators(self, text: str) -> List[str]:
"""Detect urgency indicators in the text"""
detected_indicators = []
for urgency_level, indicators in self.urgency_indicators.items():
for indicator in indicators:
if indicator in text:
detected_indicators.append(f"{urgency_level}: {indicator}")
return detected_indicators
def _classify_category(self, text: str, keywords: List[str]) -> Tuple[str, str, float, List[Dict[str, float]]]:
"""Classify the incident category and subcategory"""
category_scores = {}
subcategory_scores = {}
# Score each category based on keyword matches
for category, data in self.categories.items():
score = 0
category_keywords = data['keywords']
# Count keyword matches
for keyword in category_keywords:
if keyword in text:
score += 1
# Also check for partial matches in keywords list
for extracted_keyword in keywords:
if keyword in extracted_keyword or extracted_keyword in keyword:
score += 0.5
category_scores[category] = score
# Score subcategories
for subcategory, subcategory_keywords in data['subcategories'].items():
subcategory_score = 0
for keyword in subcategory_keywords:
if keyword in text:
subcategory_score += 1
for extracted_keyword in keywords:
if keyword in extracted_keyword or extracted_keyword in keyword:
subcategory_score += 0.5
subcategory_scores[subcategory] = subcategory_score
# Find best category
if not category_scores or max(category_scores.values()) == 0:
best_category = 'GENERAL'
best_subcategory = 'UNKNOWN'
confidence = 0.1
else:
best_category = max(category_scores, key=category_scores.get)
max_score = max(category_scores.values())
confidence = min(max_score / 10.0, 1.0) # Normalize to 0-1
# Find best subcategory within the category
if best_category in self.categories:
category_subcategories = self.categories[best_category]['subcategories']
subcategory_scores_filtered = {k: v for k, v in subcategory_scores.items() if k in category_subcategories}
if subcategory_scores_filtered and max(subcategory_scores_filtered.values()) > 0:
best_subcategory = max(subcategory_scores_filtered, key=subcategory_scores_filtered.get)
else:
best_subcategory = 'GENERAL'
else:
best_subcategory = 'GENERAL'
# Create alternative categories
alternatives = []
sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
for category, score in sorted_categories[:3]:
if category != best_category and score > 0:
alternatives.append({
'category': category,
'confidence': min(score / 10.0, 1.0)
})
return best_category, best_subcategory, confidence, alternatives
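A minimal usage sketch of the classifier above; the incident text is hypothetical and chosen only to exercise the keyword tables:

classifier = IncidentClassifier()
result = classifier.classify_incident(
    title="Database connection timeouts on checkout",
    description="Intermittent query timeouts and deadlocks on the orders database",
)
# With the keyword tables above, this text should score highest for
# INFRASTRUCTURE / DATABASE_ISSUE; confidence is the normalized keyword
# match count, capped at 1.0.
print(result.category, result.subcategory, round(result.confidence, 2))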
class SeverityAnalyzer:
"""
AI-driven severity analyzer based on impact assessment
"""
def __init__(self):
self.model_version = "v1.0"
# Severity indicators
self.severity_indicators = {
'EMERGENCY': {
'keywords': ['down', 'outage', 'critical', 'emergency', 'complete', 'total', 'all', 'entire', 'system'],
'impact_multiplier': 2.0,
'user_impact_threshold': 0.8,
'business_impact_threshold': 0.9
},
'CRITICAL': {
'keywords': ['major', 'significant', 'severe', 'blocking', 'cannot', 'unable', 'failed', 'broken'],
'impact_multiplier': 1.5,
'user_impact_threshold': 0.6,
'business_impact_threshold': 0.7
},
'HIGH': {
'keywords': ['important', 'priority', 'escalate', 'escalated', 'urgent', 'immediate', 'soon'],
'impact_multiplier': 1.2,
'user_impact_threshold': 0.4,
'business_impact_threshold': 0.5
},
'MEDIUM': {
'keywords': ['moderate', 'some', 'partial', 'intermittent', 'occasional', 'sometimes', 'minor'],
'impact_multiplier': 1.0,
'user_impact_threshold': 0.2,
'business_impact_threshold': 0.3
},
'LOW': {
'keywords': ['small', 'cosmetic', 'enhancement', 'improvement', 'suggestion', 'nice to have'],
'impact_multiplier': 0.5,
'user_impact_threshold': 0.1,
'business_impact_threshold': 0.1
}
}
def analyze_severity(self, incident_data: Dict) -> Dict:
"""
Analyze incident severity based on various factors
"""
start_time = time.time()
title = incident_data.get('title', '').lower()
description = incident_data.get('description', '').lower()
free_text = incident_data.get('free_text', '').lower()
affected_users = incident_data.get('affected_users', 0)
business_impact = incident_data.get('business_impact', '').lower()
combined_text = f"{title} {description} {free_text} {business_impact}"
# Calculate impact scores
user_impact_score = self._calculate_user_impact(affected_users, combined_text)
business_impact_score = self._calculate_business_impact(business_impact, combined_text)
technical_impact_score = self._calculate_technical_impact(combined_text)
# Determine severity based on impact scores and keywords
suggested_severity, confidence, reasoning, impact_factors = self._determine_severity(
combined_text, user_impact_score, business_impact_score, technical_impact_score
)
processing_time = time.time() - start_time
return {
'suggested_severity': suggested_severity,
'confidence_score': confidence,
'user_impact_score': user_impact_score,
'business_impact_score': business_impact_score,
'technical_impact_score': technical_impact_score,
'reasoning': reasoning,
'impact_factors': impact_factors,
'processing_time': processing_time
}
def _calculate_user_impact(self, affected_users: int, text: str) -> float:
"""Calculate user impact score (0-1)"""
# Base score from affected users count
if affected_users == 0:
# Try to extract from text
user_indicators = ['all users', 'everyone', 'entire user base', 'all customers']
if any(indicator in text for indicator in user_indicators):
base_score = 0.9
else:
base_score = 0.1
elif affected_users < 10:
base_score = 0.2
elif affected_users < 100:
base_score = 0.4
elif affected_users < 1000:
base_score = 0.6
elif affected_users < 10000:
base_score = 0.8
else:
base_score = 1.0
# Adjust based on text indicators
if 'all' in text or 'everyone' in text:
base_score = min(base_score + 0.2, 1.0)
elif 'some' in text or 'few' in text:
base_score = max(base_score - 0.1, 0.0)
return base_score
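# Worked example (hypothetical values): affected_users = 500 gives a base
# score of 0.6; if the text also mentions "all", the score is bumped to 0.8.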
def _calculate_business_impact(self, business_impact: str, text: str) -> float:
"""Calculate business impact score (0-1)"""
if not business_impact:
# Try to infer from text
high_impact_indicators = ['revenue', 'sales', 'customer', 'business', 'critical', 'essential', 'production']
if any(indicator in text for indicator in high_impact_indicators):
return 0.6
return 0.3
# Analyze business impact text
high_impact_keywords = ['revenue', 'sales', 'customer', 'business', 'critical', 'essential', 'production', 'outage', 'down']
medium_impact_keywords = ['service', 'feature', 'functionality', 'performance', 'slow']
low_impact_keywords = ['cosmetic', 'minor', 'enhancement', 'improvement']
score = 0.3 # Base score
for keyword in high_impact_keywords:
if keyword in business_impact:
score += 0.1
for keyword in medium_impact_keywords:
if keyword in business_impact:
score += 0.05
for keyword in low_impact_keywords:
if keyword in business_impact:
score -= 0.05
return min(max(score, 0.0), 1.0)
def _calculate_technical_impact(self, text: str) -> float:
"""Calculate technical impact score (0-1)"""
technical_indicators = {
'high': ['down', 'outage', 'crash', 'failed', 'broken', 'unavailable', 'error', 'exception'],
'medium': ['slow', 'performance', 'latency', 'timeout', 'intermittent', 'partial'],
'low': ['cosmetic', 'display', 'ui', 'minor', 'enhancement']
}
score = 0.3 # Base score
for level, keywords in technical_indicators.items():
for keyword in keywords:
if keyword in text:
if level == 'high':
score += 0.15
elif level == 'medium':
score += 0.08
elif level == 'low':
score -= 0.05
return min(max(score, 0.0), 1.0)
def _determine_severity(self, text: str, user_impact: float, business_impact: float, technical_impact: float) -> Tuple[str, float, str, List[str]]:
"""Determine severity based on impact scores and text analysis"""
impact_factors = []
# Calculate weighted impact score
weighted_score = (user_impact * 0.4 + business_impact * 0.4 + technical_impact * 0.2)
# Check for severity indicators in text
severity_scores = {}
for severity, data in self.severity_indicators.items():
score = 0
for keyword in data['keywords']:
if keyword in text:
score += 1
# Apply impact multiplier
score *= data['impact_multiplier']
severity_scores[severity] = score
# Find best severity match
if severity_scores and max(severity_scores.values()) > 0:
best_severity = max(severity_scores, key=severity_scores.get)
text_confidence = min(max(severity_scores.values()) / 5.0, 1.0)
else:
# Fallback to impact-based severity
if weighted_score >= 0.8:
best_severity = 'CRITICAL'
elif weighted_score >= 0.6:
best_severity = 'HIGH'
elif weighted_score >= 0.4:
best_severity = 'MEDIUM'
else:
best_severity = 'LOW'
text_confidence = 0.5
# Combine text and impact confidence
confidence = (text_confidence + (1.0 - abs(weighted_score - self._severity_to_score(best_severity)))) / 2.0
# Generate reasoning
reasoning_parts = []
if user_impact > 0.6:
reasoning_parts.append(f"High user impact ({user_impact:.1%})")
impact_factors.append(f"User Impact: {user_impact:.1%}")
if business_impact > 0.6:
reasoning_parts.append(f"Significant business impact ({business_impact:.1%})")
impact_factors.append(f"Business Impact: {business_impact:.1%}")
if technical_impact > 0.6:
reasoning_parts.append(f"Major technical impact ({technical_impact:.1%})")
impact_factors.append(f"Technical Impact: {technical_impact:.1%}")
if severity_scores and max(severity_scores.values()) > 0:
reasoning_parts.append("Severity indicators detected in incident description")
impact_factors.append("Text Analysis: Severity keywords found")
reasoning = "; ".join(reasoning_parts) if reasoning_parts else "Based on overall impact assessment"
return best_severity, confidence, reasoning, impact_factors
def _severity_to_score(self, severity: str) -> float:
"""Convert severity level to numeric score"""
severity_scores = {
'LOW': 0.2,
'MEDIUM': 0.4,
'HIGH': 0.6,
'CRITICAL': 0.8,
'EMERGENCY': 1.0
}
return severity_scores.get(severity, 0.4)
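A minimal sketch of calling the severity analyzer; the payload is hypothetical and supplies only the keys that analyze_severity reads:

analyzer = SeverityAnalyzer()
assessment = analyzer.analyze_severity({
    'title': "Checkout API down",
    'description': "Complete outage, all users unable to pay",
    'affected_users': 25000,
    'business_impact': "Revenue loss during the outage",
})
# analyze_severity returns a plain dict whose keys mirror the return statement
# above; this payload should land at the EMERGENCY end of the scale.
print(assessment['suggested_severity'], round(assessment['confidence_score'], 2))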

View File

@@ -0,0 +1,481 @@
"""
Correlation engine for linking related incidents and problem detection
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from django.utils import timezone
from .classification import IncidentClassifier
@dataclass
class CorrelationResult:
"""Result of incident correlation analysis"""
correlation_type: str
confidence_score: float
correlation_strength: str
shared_keywords: List[str]
time_difference: timedelta
similarity_score: float
is_problem_indicator: bool
problem_description: Optional[str]
class IncidentCorrelationEngine:
"""
AI-driven correlation engine for linking related incidents
"""
def __init__(self):
self.model_version = "v1.0"
self.classifier = IncidentClassifier()
# Correlation thresholds
self.correlation_thresholds = {
'VERY_STRONG': 0.9,
'STRONG': 0.7,
'MODERATE': 0.5,
'WEAK': 0.3
}
# Problem detection patterns
self.problem_patterns = {
'CASCADE_FAILURE': {
'keywords': ['cascade', 'chain', 'reaction', 'domino', 'ripple', 'effect'],
'time_window': timedelta(hours=2),
'min_incidents': 3
},
'RECURRING_ISSUE': {
'keywords': ['same', 'again', 'recurring', 'repeated', 'similar', 'identical'],
'time_window': timedelta(days=7),
'min_incidents': 2
},
'SERVICE_DEPENDENCY': {
'keywords': ['dependency', 'dependent', 'downstream', 'upstream', 'service', 'api'],
'time_window': timedelta(hours=1),
'min_incidents': 2
},
'INFRASTRUCTURE_PATTERN': {
'keywords': ['server', 'database', 'network', 'storage', 'infrastructure'],
'time_window': timedelta(hours=4),
'min_incidents': 3
}
}
def correlate_incidents(self, incident_a: Dict, incident_b: Dict) -> Optional[CorrelationResult]:
"""
Correlate two incidents and determine if they are related
"""
# Calculate various similarity metrics
text_similarity = self._calculate_text_similarity(incident_a, incident_b)
temporal_similarity = self._calculate_temporal_similarity(incident_a, incident_b)
service_similarity = self._calculate_service_similarity(incident_a, incident_b)
category_similarity = self._calculate_category_similarity(incident_a, incident_b)
# Calculate overall similarity score
overall_similarity = (
text_similarity * 0.4 +
temporal_similarity * 0.2 +
service_similarity * 0.2 +
category_similarity * 0.2
)
# Determine if incidents are correlated
if overall_similarity < 0.3:
return None
# Determine correlation type
correlation_type = self._determine_correlation_type(
incident_a, incident_b, text_similarity, temporal_similarity, service_similarity
)
# Calculate confidence score
confidence_score = self._calculate_confidence_score(
overall_similarity, correlation_type, incident_a, incident_b
)
# Determine correlation strength
correlation_strength = self._determine_correlation_strength(confidence_score)
# Extract shared keywords
shared_keywords = self._extract_shared_keywords(incident_a, incident_b)
# Calculate time difference
time_diff = self._calculate_time_difference(incident_a, incident_b)
# Check for problem indicators
is_problem_indicator, problem_description = self._detect_problem_patterns(
incident_a, incident_b, correlation_type, confidence_score
)
return CorrelationResult(
correlation_type=correlation_type,
confidence_score=confidence_score,
correlation_strength=correlation_strength,
shared_keywords=shared_keywords,
time_difference=time_diff,
similarity_score=overall_similarity,
is_problem_indicator=is_problem_indicator,
problem_description=problem_description
)
def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate text similarity between two incidents"""
# Combine text fields
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()
# Extract keywords
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
if not keywords_a or not keywords_b:
return 0.0
# Calculate Jaccard similarity
intersection = len(keywords_a.intersection(keywords_b))
union = len(keywords_a.union(keywords_b))
jaccard_similarity = intersection / union if union > 0 else 0.0
# Also check for exact phrase matches
phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)
# Combine similarities
return (jaccard_similarity * 0.7 + phrase_similarity * 0.3)
def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate similarity based on common phrases"""
# Extract 2-3 word phrases
phrases_a = set()
phrases_b = set()
words_a = text_a.split()
words_b = text_b.split()
# Extract 2-word phrases
for i in range(len(words_a) - 1):
phrases_a.add(f"{words_a[i]} {words_a[i+1]}")
for i in range(len(words_b) - 1):
phrases_b.add(f"{words_b[i]} {words_b[i+1]}")
# Extract 3-word phrases
for i in range(len(words_a) - 2):
phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")
for i in range(len(words_b) - 2):
phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")
if not phrases_a or not phrases_b:
return 0.0
intersection = len(phrases_a.intersection(phrases_b))
union = len(phrases_a.union(phrases_b))
return intersection / union if union > 0 else 0.0
def _calculate_temporal_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate temporal similarity between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return 0.0
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
time_diff = abs((created_a - created_b).total_seconds())
# Calculate similarity based on time difference
# Incidents within 1 hour: high similarity
# Incidents within 24 hours: medium similarity
# Incidents within 7 days: low similarity
if time_diff <= 3600: # 1 hour
return 1.0
elif time_diff <= 86400: # 24 hours
return 0.7
elif time_diff <= 604800: # 7 days
return 0.3
else:
return 0.0
def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate service/component similarity"""
# Extract service/component information from text
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
# Common service/component keywords
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
if not services_a or not services_b:
return 0.0
intersection = len(services_a.intersection(services_b))
union = len(services_a.union(services_b))
return intersection / union if union > 0 else 0.0
def _calculate_category_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate category similarity"""
category_a = incident_a.get('category', '')
category_b = incident_b.get('category', '')
if not category_a or not category_b:
return 0.0
if category_a == category_b:
return 1.0
# Check for related categories
related_categories = {
'INFRASTRUCTURE': ['APPLICATION', 'SECURITY'],
'APPLICATION': ['INFRASTRUCTURE', 'USER_EXPERIENCE'],
'SECURITY': ['INFRASTRUCTURE', 'APPLICATION'],
'USER_EXPERIENCE': ['APPLICATION', 'DATA'],
'DATA': ['USER_EXPERIENCE', 'INTEGRATION'],
'INTEGRATION': ['DATA', 'APPLICATION']
}
if category_b in related_categories.get(category_a, []):
return 0.5
return 0.0
def _determine_correlation_type(self, incident_a: Dict, incident_b: Dict,
text_similarity: float, temporal_similarity: float,
service_similarity: float) -> str:
"""Determine the type of correlation between incidents"""
# Same service correlation
if service_similarity > 0.7:
return 'SAME_SERVICE'
# Same component correlation
if text_similarity > 0.6 and service_similarity > 0.4:
return 'SAME_COMPONENT'
# Temporal correlation
if temporal_similarity > 0.7 and text_similarity > 0.3:
return 'TEMPORAL'
# Pattern match
if text_similarity > 0.5:
return 'PATTERN'
# Dependency correlation
if service_similarity > 0.4 and temporal_similarity > 0.5:
return 'DEPENDENCY'
# Cascade effect
if temporal_similarity > 0.8 and text_similarity > 0.4:
return 'CASCADE'
return 'PATTERN' # Default
def _calculate_confidence_score(self, overall_similarity: float, correlation_type: str,
incident_a: Dict, incident_b: Dict) -> float:
"""Calculate confidence score for the correlation"""
base_confidence = overall_similarity
# Adjust based on correlation type
type_adjustments = {
'SAME_SERVICE': 0.1,
'SAME_COMPONENT': 0.15,
'TEMPORAL': 0.05,
'PATTERN': 0.0,
'DEPENDENCY': 0.1,
'CASCADE': 0.2
}
base_confidence += type_adjustments.get(correlation_type, 0.0)
# Adjust based on incident characteristics
if incident_a.get('severity') == incident_b.get('severity'):
base_confidence += 0.05
if incident_a.get('status') == incident_b.get('status'):
base_confidence += 0.03
return min(base_confidence, 1.0)
def _determine_correlation_strength(self, confidence_score: float) -> str:
"""Determine correlation strength based on confidence score"""
if confidence_score >= self.correlation_thresholds['VERY_STRONG']:
return 'VERY_STRONG'
elif confidence_score >= self.correlation_thresholds['STRONG']:
return 'STRONG'
elif confidence_score >= self.correlation_thresholds['MODERATE']:
return 'MODERATE'
else:
return 'WEAK'
def _extract_shared_keywords(self, incident_a: Dict, incident_b: Dict) -> List[str]:
"""Extract keywords shared between incidents"""
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
shared = list(keywords_a.intersection(keywords_b))
return shared[:10] # Return top 10 shared keywords
def _calculate_time_difference(self, incident_a: Dict, incident_b: Dict) -> timedelta:
"""Calculate time difference between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return timedelta(0)
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
return abs(created_a - created_b)
def _detect_problem_patterns(self, incident_a: Dict, incident_b: Dict,
correlation_type: str, confidence_score: float) -> Tuple[bool, Optional[str]]:
"""Detect if correlation indicates a larger problem"""
# High confidence correlations are more likely to indicate problems
if confidence_score < 0.6:
return False, None
# Check for specific problem patterns
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
combined_text = f"{text_a} {text_b}"
for pattern_name, pattern_data in self.problem_patterns.items():
# Check for pattern keywords
keyword_matches = sum(1 for keyword in pattern_data['keywords'] if keyword in combined_text)
if keyword_matches >= 2: # At least 2 keywords match
return True, f"Potential {pattern_name.replace('_', ' ').lower()} detected"
# Check for cascade effects
if correlation_type == 'CASCADE' and confidence_score > 0.7:
return True, "Potential cascade failure detected"
# Check for recurring issues
if correlation_type == 'SAME_SERVICE' and confidence_score > 0.8:
return True, "Potential recurring service issue detected"
return False, None
def find_related_incidents(self, target_incident: Dict, all_incidents: List[Dict],
limit: int = 10) -> List[Tuple[Dict, CorrelationResult]]:
"""Find incidents related to a target incident"""
correlations = []
for incident in all_incidents:
if incident['id'] == target_incident['id']:
continue
correlation = self.correlate_incidents(target_incident, incident)
if correlation:
correlations.append((incident, correlation))
# Sort by confidence score and return top results
correlations.sort(key=lambda x: x[1].confidence_score, reverse=True)
return correlations[:limit]
def detect_problem_clusters(self, incidents: List[Dict],
min_incidents: int = 3,
time_window: timedelta = timedelta(hours=24)) -> List[Dict]:
"""Detect clusters of related incidents that might indicate larger problems"""
clusters = []
processed_incidents = set()
for incident in incidents:
if incident['id'] in processed_incidents:
continue
# Find related incidents within time window
related_incidents = []
incident_time = incident.get('created_at')
if isinstance(incident_time, str):
incident_time = datetime.fromisoformat(incident_time.replace('Z', '+00:00'))
for other_incident in incidents:
if other_incident['id'] == incident['id'] or other_incident['id'] in processed_incidents:
continue
other_time = other_incident.get('created_at')
if isinstance(other_time, str):
other_time = datetime.fromisoformat(other_time.replace('Z', '+00:00'))
# Check if within time window
if abs((incident_time - other_time).total_seconds()) <= time_window.total_seconds():
correlation = self.correlate_incidents(incident, other_incident)
if correlation and correlation.confidence_score > 0.5:
related_incidents.append((other_incident, correlation))
# If we found enough related incidents, create a cluster
if len(related_incidents) >= min_incidents - 1: # -1 because we include the original incident
cluster = {
'incidents': [incident] + [inc[0] for inc in related_incidents],
'correlations': [inc[1] for inc in related_incidents],
'problem_type': self._classify_problem_type(incident, related_incidents),
'confidence': sum(inc[1].confidence_score for inc in related_incidents) / len(related_incidents),
'time_span': self._calculate_cluster_time_span([incident] + [inc[0] for inc in related_incidents])
}
clusters.append(cluster)
# Mark incidents as processed
processed_incidents.add(incident['id'])
for related_incident, _ in related_incidents:
processed_incidents.add(related_incident['id'])
return clusters
def _classify_problem_type(self, incident: Dict, related_incidents: List[Tuple[Dict, CorrelationResult]]) -> str:
"""Classify the type of problem based on incident cluster"""
correlation_types = [corr.correlation_type for _, corr in related_incidents]
if 'CASCADE' in correlation_types:
return 'CASCADE_FAILURE'
elif 'SAME_SERVICE' in correlation_types:
return 'SERVICE_OUTAGE'
elif 'TEMPORAL' in correlation_types:
return 'RECURRING_ISSUE'
else:
return 'PATTERN_BASED_PROBLEM'
def _calculate_cluster_time_span(self, incidents: List[Dict]) -> timedelta:
"""Calculate the time span of a cluster of incidents"""
times = []
for incident in incidents:
created_at = incident.get('created_at')
if isinstance(created_at, str):
created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
times.append(created_at)
if len(times) < 2:
return timedelta(0)
return max(times) - min(times)
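A minimal usage sketch of the correlation engine; both incident dicts are hypothetical and carry only the keys the engine reads (id, title, description, and an ISO-8601 created_at):

engine = IncidentCorrelationEngine()
incident_a = {
    'id': 1,
    'title': "Payment API timeouts",
    'description': "Gateway returning 504 errors, database connection pool exhausted",
    'created_at': "2025-09-19T08:00:00+00:00",
}
incident_b = {
    'id': 2,
    'title': "Database connection errors in payment service",
    'description': "Connection timeouts from the payment API to the orders database",
    'created_at': "2025-09-19T08:20:00+00:00",
}
correlation = engine.correlate_incidents(incident_a, incident_b)
if correlation:  # correlate_incidents returns None below the 0.3 similarity floor
    print(correlation.correlation_type, correlation.correlation_strength,
          round(correlation.confidence_score, 2), correlation.shared_keywords)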

View File

@@ -0,0 +1,516 @@
"""
Duplication detection engine for identifying and merging duplicate incidents
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from .classification import IncidentClassifier
@dataclass
class DuplicationResult:
"""Result of duplication detection analysis"""
duplication_type: str
similarity_score: float
confidence_score: float
text_similarity: float
temporal_proximity: float
service_similarity: float
recommended_action: str
merge_confidence: float
reasoning: str
shared_elements: List[str]
class DuplicationDetector:
"""
AI-driven duplication detector for identifying duplicate incidents
"""
def __init__(self):
self.model_version = "v1.0"
self.classifier = IncidentClassifier()
# Duplication thresholds
self.duplication_thresholds = {
'EXACT': 0.95,
'NEAR_DUPLICATE': 0.85,
'SIMILAR': 0.70,
'POTENTIAL_DUPLICATE': 0.50
}
# Action thresholds
self.action_thresholds = {
'MERGE': 0.90,
'LINK': 0.75,
'REVIEW': 0.60,
'NO_ACTION': 0.0
}
# Time windows for temporal proximity
self.time_windows = {
'EXACT': timedelta(minutes=30),
'NEAR_DUPLICATE': timedelta(hours=2),
'SIMILAR': timedelta(hours=24),
'POTENTIAL_DUPLICATE': timedelta(days=7)
}
def detect_duplication(self, incident_a: Dict, incident_b: Dict) -> Optional[DuplicationResult]:
"""
Detect if two incidents are duplicates
"""
# Calculate various similarity metrics
text_similarity = self._calculate_text_similarity(incident_a, incident_b)
temporal_proximity = self._calculate_temporal_proximity(incident_a, incident_b)
service_similarity = self._calculate_service_similarity(incident_a, incident_b)
metadata_similarity = self._calculate_metadata_similarity(incident_a, incident_b)
# Calculate overall similarity score
overall_similarity = (
text_similarity * 0.5 +
temporal_proximity * 0.2 +
service_similarity * 0.2 +
metadata_similarity * 0.1
)
# Determine duplication type
duplication_type = self._determine_duplication_type(overall_similarity, text_similarity, temporal_proximity)
if duplication_type == 'NO_DUPLICATE':
return None
# Calculate confidence score
confidence_score = self._calculate_confidence_score(
overall_similarity, text_similarity, temporal_proximity, service_similarity
)
# Determine recommended action
recommended_action = self._determine_recommended_action(confidence_score, duplication_type)
# Calculate merge confidence
merge_confidence = self._calculate_merge_confidence(
confidence_score, duplication_type, incident_a, incident_b
)
# Generate reasoning
reasoning = self._generate_reasoning(
duplication_type, text_similarity, temporal_proximity, service_similarity
)
# Extract shared elements
shared_elements = self._extract_shared_elements(incident_a, incident_b)
return DuplicationResult(
duplication_type=duplication_type,
similarity_score=overall_similarity,
confidence_score=confidence_score,
text_similarity=text_similarity,
temporal_proximity=temporal_proximity,
service_similarity=service_similarity,
recommended_action=recommended_action,
merge_confidence=merge_confidence,
reasoning=reasoning,
shared_elements=shared_elements
)
def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate text similarity between incidents"""
# Combine all text fields
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()
# Calculate multiple similarity metrics
jaccard_similarity = self._calculate_jaccard_similarity(text_a, text_b)
cosine_similarity = self._calculate_cosine_similarity(text_a, text_b)
phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)
semantic_similarity = self._calculate_semantic_similarity(text_a, text_b)
# Weighted combination
return (
jaccard_similarity * 0.3 +
cosine_similarity * 0.3 +
phrase_similarity * 0.2 +
semantic_similarity * 0.2
)
def _calculate_jaccard_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate Jaccard similarity based on word sets"""
words_a = set(text_a.split())
words_b = set(text_b.split())
if not words_a or not words_b:
return 0.0
intersection = len(words_a.intersection(words_b))
union = len(words_a.union(words_b))
return intersection / union if union > 0 else 0.0
def _calculate_cosine_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate cosine similarity based on word frequency"""
from collections import Counter
words_a = Counter(text_a.split())
words_b = Counter(text_b.split())
# Get all unique words
all_words = set(words_a.keys()) | set(words_b.keys())
if not all_words:
return 0.0
# Create vectors
vector_a = [words_a.get(word, 0) for word in all_words]
vector_b = [words_b.get(word, 0) for word in all_words]
# Calculate cosine similarity
dot_product = sum(a * b for a, b in zip(vector_a, vector_b))
magnitude_a = sum(a * a for a in vector_a) ** 0.5
magnitude_b = sum(b * b for b in vector_b) ** 0.5
if magnitude_a == 0 or magnitude_b == 0:
return 0.0
return dot_product / (magnitude_a * magnitude_b)
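# Worked example (hypothetical texts): for text_a = "database timeout error"
# and text_b = "database timeout again", the shared words give a dot product
# of 2 with both magnitudes sqrt(3), so the cosine similarity is 2 / 3 ≈ 0.67.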
def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate similarity based on common phrases"""
# Extract 2-3 word phrases
phrases_a = set()
phrases_b = set()
words_a = text_a.split()
words_b = text_b.split()
# Extract 2-word phrases
for i in range(len(words_a) - 1):
phrases_a.add(f"{words_a[i]} {words_a[i+1]}")
for i in range(len(words_b) - 1):
phrases_b.add(f"{words_b[i]} {words_b[i+1]}")
# Extract 3-word phrases
for i in range(len(words_a) - 2):
phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")
for i in range(len(words_b) - 2):
phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")
if not phrases_a or not phrases_b:
return 0.0
intersection = len(phrases_a.intersection(phrases_b))
union = len(phrases_a.union(phrases_b))
return intersection / union if union > 0 else 0.0
def _calculate_semantic_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate semantic similarity using keyword analysis"""
# Extract keywords using the classifier
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
if not keywords_a or not keywords_b:
return 0.0
# Calculate semantic similarity based on keyword overlap
intersection = len(keywords_a.intersection(keywords_b))
union = len(keywords_a.union(keywords_b))
base_similarity = intersection / union if union > 0 else 0.0
# Boost similarity for technical terms
technical_terms = {
'error', 'exception', 'timeout', 'connection', 'database', 'server',
'api', 'service', 'application', 'network', 'storage', 'memory',
'cpu', 'disk', 'bandwidth', 'latency', 'performance', 'crash'
}
technical_intersection = len(keywords_a.intersection(keywords_b).intersection(technical_terms))
if technical_intersection > 0:
base_similarity += 0.1 * technical_intersection
return min(base_similarity, 1.0)
def _calculate_temporal_proximity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate temporal proximity between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return 0.0
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
time_diff = abs((created_a - created_b).total_seconds())
# Calculate proximity score based on time difference
if time_diff <= 300: # 5 minutes
return 1.0
elif time_diff <= 1800: # 30 minutes
return 0.9
elif time_diff <= 3600: # 1 hour
return 0.7
elif time_diff <= 7200: # 2 hours
return 0.5
elif time_diff <= 86400: # 24 hours
return 0.3
elif time_diff <= 604800: # 7 days
return 0.1
else:
return 0.0
def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate service/component similarity"""
# Extract service information from text
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
# Service/component keywords
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth',
'user service', 'order service', 'payment service', 'notification service'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
if not services_a or not services_b:
return 0.0
intersection = len(services_a.intersection(services_b))
union = len(services_a.union(services_b))
return intersection / union if union > 0 else 0.0
def _calculate_metadata_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate similarity based on metadata fields"""
similarity_score = 0.0
total_fields = 0
# Compare severity
if incident_a.get('severity') == incident_b.get('severity'):
similarity_score += 1.0
total_fields += 1
# Compare status
if incident_a.get('status') == incident_b.get('status'):
similarity_score += 1.0
total_fields += 1
# Compare category
if incident_a.get('category') == incident_b.get('category'):
similarity_score += 1.0
total_fields += 1
# Compare assigned user
if incident_a.get('assigned_to') == incident_b.get('assigned_to'):
similarity_score += 1.0
total_fields += 1
# Compare reporter
if incident_a.get('reporter') == incident_b.get('reporter'):
similarity_score += 1.0
total_fields += 1
return similarity_score / total_fields if total_fields > 0 else 0.0
def _determine_duplication_type(self, overall_similarity: float, text_similarity: float,
temporal_proximity: float) -> str:
"""Determine the type of duplication"""
if overall_similarity >= self.duplication_thresholds['EXACT']:
return 'EXACT'
elif overall_similarity >= self.duplication_thresholds['NEAR_DUPLICATE']:
return 'NEAR_DUPLICATE'
elif overall_similarity >= self.duplication_thresholds['SIMILAR']:
return 'SIMILAR'
elif overall_similarity >= self.duplication_thresholds['POTENTIAL_DUPLICATE']:
return 'POTENTIAL_DUPLICATE'
else:
return 'NO_DUPLICATE'
def _calculate_confidence_score(self, overall_similarity: float, text_similarity: float,
temporal_proximity: float, service_similarity: float) -> float:
"""Calculate confidence score for duplication detection"""
base_confidence = overall_similarity
# Boost confidence for high text similarity
if text_similarity > 0.8:
base_confidence += 0.1
# Boost confidence for high temporal proximity
if temporal_proximity > 0.8:
base_confidence += 0.1
# Boost confidence for high service similarity
if service_similarity > 0.8:
base_confidence += 0.05
return min(base_confidence, 1.0)
def _determine_recommended_action(self, confidence_score: float, duplication_type: str) -> str:
"""Determine recommended action based on confidence and duplication type"""
if confidence_score >= self.action_thresholds['MERGE']:
return 'MERGE'
elif confidence_score >= self.action_thresholds['LINK']:
return 'LINK'
elif confidence_score >= self.action_thresholds['REVIEW']:
return 'REVIEW'
else:
return 'NO_ACTION'
def _calculate_merge_confidence(self, confidence_score: float, duplication_type: str,
incident_a: Dict, incident_b: Dict) -> float:
"""Calculate confidence for merging incidents"""
merge_confidence = confidence_score
# Adjust based on duplication type
type_adjustments = {
'EXACT': 0.1,
'NEAR_DUPLICATE': 0.05,
'SIMILAR': 0.0,
'POTENTIAL_DUPLICATE': -0.1
}
merge_confidence += type_adjustments.get(duplication_type, 0.0)
# Adjust based on incident status
if incident_a.get('status') == incident_b.get('status'):
merge_confidence += 0.05
# Adjust based on severity
if incident_a.get('severity') == incident_b.get('severity'):
merge_confidence += 0.03
return min(max(merge_confidence, 0.0), 1.0)
def _generate_reasoning(self, duplication_type: str, text_similarity: float,
temporal_proximity: float, service_similarity: float) -> str:
"""Generate human-readable reasoning for duplication detection"""
reasoning_parts = []
if text_similarity > 0.8:
reasoning_parts.append(f"Very high text similarity ({text_similarity:.1%})")
elif text_similarity > 0.6:
reasoning_parts.append(f"High text similarity ({text_similarity:.1%})")
elif text_similarity > 0.4:
reasoning_parts.append(f"Moderate text similarity ({text_similarity:.1%})")
if temporal_proximity > 0.8:
reasoning_parts.append(f"Very close temporal proximity ({temporal_proximity:.1%})")
elif temporal_proximity > 0.6:
reasoning_parts.append(f"Close temporal proximity ({temporal_proximity:.1%})")
if service_similarity > 0.8:
reasoning_parts.append(f"Very high service similarity ({service_similarity:.1%})")
elif service_similarity > 0.6:
reasoning_parts.append(f"High service similarity ({service_similarity:.1%})")
if duplication_type == 'EXACT':
reasoning_parts.append("Incidents appear to be exact duplicates")
elif duplication_type == 'NEAR_DUPLICATE':
reasoning_parts.append("Incidents appear to be near duplicates")
elif duplication_type == 'SIMILAR':
reasoning_parts.append("Incidents appear to be similar")
elif duplication_type == 'POTENTIAL_DUPLICATE':
reasoning_parts.append("Incidents may be duplicates")
return "; ".join(reasoning_parts) if reasoning_parts else "Based on overall similarity analysis"
def _extract_shared_elements(self, incident_a: Dict, incident_b: Dict) -> List[str]:
"""Extract elements shared between incidents"""
shared_elements = []
# Shared keywords
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
shared_keywords = keywords_a.intersection(keywords_b)
if shared_keywords:
shared_elements.append(f"Keywords: {', '.join(list(shared_keywords)[:5])}")
# Shared services
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
shared_services = services_a.intersection(services_b)
if shared_services:
shared_elements.append(f"Services: {', '.join(shared_services)}")
# Shared metadata
if incident_a.get('severity') == incident_b.get('severity'):
shared_elements.append(f"Severity: {incident_a.get('severity')}")
if incident_a.get('category') == incident_b.get('category'):
shared_elements.append(f"Category: {incident_a.get('category')}")
if incident_a.get('status') == incident_b.get('status'):
shared_elements.append(f"Status: {incident_a.get('status')}")
return shared_elements
def find_duplicate_candidates(self, target_incident: Dict, all_incidents: List[Dict],
limit: int = 10) -> List[Tuple[Dict, DuplicationResult]]:
"""Find incidents that might be duplicates of the target incident"""
candidates = []
for incident in all_incidents:
if incident['id'] == target_incident['id']:
continue
duplication = self.detect_duplication(target_incident, incident)
if duplication:
candidates.append((incident, duplication))
# Sort by confidence score and return top results
candidates.sort(key=lambda x: x[1].confidence_score, reverse=True)
return candidates[:limit]
def batch_detect_duplicates(self, incidents: List[Dict]) -> List[Tuple[Dict, Dict, DuplicationResult]]:
"""Batch detect duplicates in a list of incidents"""
duplicates = []
processed_pairs = set()
for i, incident_a in enumerate(incidents):
for j, incident_b in enumerate(incidents[i+1:], i+1):
# Create a unique pair identifier
pair_id = tuple(sorted([incident_a['id'], incident_b['id']]))
if pair_id in processed_pairs:
continue
processed_pairs.add(pair_id)
duplication = self.detect_duplication(incident_a, incident_b)
if duplication:
duplicates.append((incident_a, incident_b, duplication))
# Sort by confidence score
duplicates.sort(key=lambda x: x[2].confidence_score, reverse=True)
return duplicates
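A minimal sketch of the duplication detector on two hypothetical near-duplicate reports; the reported type and action depend on the thresholds defined above:

detector = DuplicationDetector()
reports = [
    {'id': 101, 'title': "Login page down",
     'description': "Users cannot log in, auth service returning errors",
     'created_at': "2025-09-19T09:00:00+00:00", 'severity': 'HIGH', 'status': 'OPEN'},
    {'id': 102, 'title': "Login broken",
     'description': "Auth service errors, users cannot log in",
     'created_at': "2025-09-19T09:05:00+00:00", 'severity': 'HIGH', 'status': 'OPEN'},
]
for incident_a, incident_b, dup in detector.batch_detect_duplicates(reports):
    print(incident_a['id'], incident_b['id'], dup.duplication_type,
          dup.recommended_action, round(dup.confidence_score, 2))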