Updates
1 ETB-API/incident_intelligence/ai/__init__.py Normal file
@@ -0,0 +1 @@
# AI components for incident intelligence
Binary file not shown.
Binary file not shown.
471 ETB-API/incident_intelligence/ai/classification.py Normal file
@@ -0,0 +1,471 @@
"""
AI-driven incident classification using NLP techniques
"""
import re
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from django.conf import settings


@dataclass
class ClassificationResult:
    """Result of incident classification"""
    category: str
    subcategory: str
    confidence: float
    alternative_categories: List[Dict[str, float]]
    keywords: List[str]
    sentiment_score: float
    urgency_indicators: List[str]


class IncidentClassifier:
    """
    AI-driven incident classifier using rule-based and ML techniques
    """

    def __init__(self):
        self.model_version = "v1.0"

        # Predefined categories and their keywords
        self.categories = {
            'INFRASTRUCTURE': {
                'keywords': ['server', 'database', 'network', 'storage', 'disk', 'memory', 'cpu', 'load', 'bandwidth', 'connection', 'timeout', 'latency'],
                'subcategories': {
                    'SERVER_ISSUE': ['server', 'host', 'machine', 'instance', 'vm', 'container'],
                    'DATABASE_ISSUE': ['database', 'db', 'sql', 'query', 'connection', 'timeout', 'deadlock'],
                    'NETWORK_ISSUE': ['network', 'connectivity', 'dns', 'firewall', 'routing', 'packet', 'bandwidth'],
                    'STORAGE_ISSUE': ['storage', 'disk', 'volume', 'space', 'capacity', 'i/o', 'read', 'write'],
                }
            },
            'APPLICATION': {
                'keywords': ['application', 'app', 'service', 'api', 'endpoint', 'response', 'error', 'exception', 'crash', 'bug'],
                'subcategories': {
                    'API_ISSUE': ['api', 'endpoint', 'response', 'status', 'code', 'timeout', 'rate', 'limit'],
                    'SERVICE_ISSUE': ['service', 'microservice', 'dependency', 'circuit', 'breaker', 'fallback'],
                    'PERFORMANCE_ISSUE': ['performance', 'slow', 'latency', 'response', 'time', 'throughput', 'bottleneck'],
                    'FUNCTIONALITY_ISSUE': ['bug', 'feature', 'functionality', 'behavior', 'unexpected', 'incorrect'],
                }
            },
            'SECURITY': {
                'keywords': ['security', 'authentication', 'authorization', 'access', 'permission', 'breach', 'attack', 'vulnerability', 'malware'],
                'subcategories': {
                    'AUTH_ISSUE': ['authentication', 'login', 'password', 'token', 'session', 'credential'],
                    'ACCESS_ISSUE': ['authorization', 'permission', 'access', 'denied', 'forbidden', 'unauthorized'],
                    'THREAT_ISSUE': ['attack', 'breach', 'malware', 'virus', 'intrusion', 'suspicious', 'anomaly'],
                    'VULNERABILITY': ['vulnerability', 'exploit', 'patch', 'update', 'security', 'fix'],
                }
            },
            'USER_EXPERIENCE': {
                'keywords': ['user', 'interface', 'ui', 'ux', 'experience', 'usability', 'navigation', 'button', 'form', 'page'],
                'subcategories': {
                    'UI_ISSUE': ['interface', 'ui', 'button', 'form', 'page', 'layout', 'display', 'rendering'],
                    'NAVIGATION_ISSUE': ['navigation', 'menu', 'link', 'redirect', 'routing', 'page', 'not', 'found'],
                    'USABILITY_ISSUE': ['usability', 'experience', 'confusing', 'difficult', 'unclear', 'intuitive'],
                    'MOBILE_ISSUE': ['mobile', 'app', 'responsive', 'device', 'screen', 'touch', 'gesture'],
                }
            },
            'DATA': {
                'keywords': ['data', 'file', 'import', 'export', 'sync', 'backup', 'recovery', 'corruption', 'missing', 'duplicate'],
                'subcategories': {
                    'DATA_CORRUPTION': ['corruption', 'corrupted', 'invalid', 'malformed', 'broken', 'damaged'],
                    'DATA_LOSS': ['missing', 'lost', 'deleted', 'removed', 'disappeared', 'not', 'found'],
                    'SYNC_ISSUE': ['sync', 'synchronization', 'conflict', 'merge', 'update', 'latest'],
                    'BACKUP_ISSUE': ['backup', 'restore', 'recovery', 'archive', 'retention', 'storage'],
                }
            },
            'INTEGRATION': {
                'keywords': ['integration', 'third-party', 'external', 'webhook', 'api', 'connection', 'sync', 'import', 'export'],
                'subcategories': {
                    'THIRD_PARTY_ISSUE': ['third-party', 'external', 'vendor', 'partner', 'service', 'provider'],
                    'WEBHOOK_ISSUE': ['webhook', 'callback', 'notification', 'event', 'trigger', 'delivery'],
                    'API_INTEGRATION': ['api', 'integration', 'endpoint', 'connection', 'authentication', 'response'],
                    'DATA_INTEGRATION': ['import', 'export', 'migration', 'transformation', 'mapping', 'format'],
                }
            }
        }

        # Urgency indicators
        self.urgency_indicators = {
            'CRITICAL': ['down', 'outage', 'critical', 'emergency', 'urgent', 'immediate', 'severe', 'complete', 'total'],
            'HIGH': ['major', 'significant', 'important', 'priority', 'escalate', 'escalated', 'blocking'],
            'MEDIUM': ['moderate', 'some', 'partial', 'intermittent', 'occasional', 'sometimes'],
            'LOW': ['minor', 'small', 'cosmetic', 'enhancement', 'improvement', 'suggestion']
        }

        # Sentiment analysis keywords
        self.sentiment_keywords = {
            'positive': ['working', 'fixed', 'resolved', 'good', 'excellent', 'improved', 'better', 'success'],
            'negative': ['broken', 'failed', 'error', 'issue', 'problem', 'bug', 'crash', 'down', 'slow', 'terrible', 'awful'],
            'neutral': ['report', 'incident', 'ticket', 'request', 'update', 'status', 'information']
        }

    def classify_incident(self, title: str, description: str, free_text: str = "") -> ClassificationResult:
        """
        Classify an incident based on its text content
        """
        start_time = time.time()

        # Combine all text for analysis
        combined_text = f"{title} {description} {free_text}".lower()

        # Extract keywords
        keywords = self._extract_keywords(combined_text)

        # Analyze sentiment
        sentiment_score = self._analyze_sentiment(combined_text)

        # Detect urgency indicators
        urgency_indicators = self._detect_urgency_indicators(combined_text)

        # Classify category and subcategory
        category, subcategory, confidence, alternatives = self._classify_category(combined_text, keywords)

        processing_time = time.time() - start_time

        return ClassificationResult(
            category=category,
            subcategory=subcategory,
            confidence=confidence,
            alternative_categories=alternatives,
            keywords=keywords,
            sentiment_score=sentiment_score,
            urgency_indicators=urgency_indicators
        )

    def _extract_keywords(self, text: str) -> List[str]:
        """Extract relevant keywords from text"""
        # Simple keyword extraction - in production, use more sophisticated NLP
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text)

        # Filter out common stop words
        stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', 'a', 'an'}

        keywords = [word for word in words if word not in stop_words]

        # Count frequency and return top keywords
        from collections import Counter
        keyword_counts = Counter(keywords)
        return [word for word, count in keyword_counts.most_common(10)]

    def _analyze_sentiment(self, text: str) -> float:
        """Analyze sentiment of the text (-1 to 1)"""
        positive_count = sum(1 for word in self.sentiment_keywords['positive'] if word in text)
        negative_count = sum(1 for word in self.sentiment_keywords['negative'] if word in text)

        total_sentiment_words = positive_count + negative_count
        if total_sentiment_words == 0:
            return 0.0

        return (positive_count - negative_count) / total_sentiment_words

    def _detect_urgency_indicators(self, text: str) -> List[str]:
        """Detect urgency indicators in the text"""
        detected_indicators = []

        for urgency_level, indicators in self.urgency_indicators.items():
            for indicator in indicators:
                if indicator in text:
                    detected_indicators.append(f"{urgency_level}: {indicator}")

        return detected_indicators

    def _classify_category(self, text: str, keywords: List[str]) -> Tuple[str, str, float, List[Dict[str, float]]]:
        """Classify the incident category and subcategory"""
        category_scores = {}
        subcategory_scores = {}

        # Score each category based on keyword matches
        for category, data in self.categories.items():
            score = 0
            category_keywords = data['keywords']

            # Count keyword matches
            for keyword in category_keywords:
                if keyword in text:
                    score += 1
                # Also check for partial matches in keywords list
                for extracted_keyword in keywords:
                    if keyword in extracted_keyword or extracted_keyword in keyword:
                        score += 0.5

            category_scores[category] = score

            # Score subcategories
            for subcategory, subcategory_keywords in data['subcategories'].items():
                subcategory_score = 0
                for keyword in subcategory_keywords:
                    if keyword in text:
                        subcategory_score += 1
                    for extracted_keyword in keywords:
                        if keyword in extracted_keyword or extracted_keyword in keyword:
                            subcategory_score += 0.5

                subcategory_scores[subcategory] = subcategory_score

        # Find best category
        if not category_scores or max(category_scores.values()) == 0:
            best_category = 'GENERAL'
            best_subcategory = 'UNKNOWN'
            confidence = 0.1
        else:
            best_category = max(category_scores, key=category_scores.get)
            max_score = max(category_scores.values())
            confidence = min(max_score / 10.0, 1.0)  # Normalize to 0-1

            # Find best subcategory within the category
            if best_category in self.categories:
                category_subcategories = self.categories[best_category]['subcategories']
                subcategory_scores_filtered = {k: v for k, v in subcategory_scores.items() if k in category_subcategories}

                if subcategory_scores_filtered and max(subcategory_scores_filtered.values()) > 0:
                    best_subcategory = max(subcategory_scores_filtered, key=subcategory_scores_filtered.get)
                else:
                    best_subcategory = 'GENERAL'
            else:
                best_subcategory = 'GENERAL'

        # Create alternative categories
        alternatives = []
        sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
        for category, score in sorted_categories[:3]:
            if category != best_category and score > 0:
                alternatives.append({
                    'category': category,
                    'confidence': min(score / 10.0, 1.0)
                })

        return best_category, best_subcategory, confidence, alternatives
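
# --- Editor's note: minimal usage sketch of IncidentClassifier (illustration only,
# not part of this commit). The sample incident text is made up; scores depend
# purely on keyword matches against the category tables above.
#
#     classifier = IncidentClassifier()
#     result = classifier.classify_incident(
#         title="Database connection timeouts",
#         description="Orders API fails intermittently; db queries hit deadlocks.",
#     )
#     result.category      # e.g. 'INFRASTRUCTURE'
#     result.subcategory   # e.g. 'DATABASE_ISSUE'
#     result.confidence    # keyword-match score normalised to 0-1 (score / 10, capped)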


class SeverityAnalyzer:
    """
    AI-driven severity analyzer based on impact assessment
    """

    def __init__(self):
        self.model_version = "v1.0"

        # Severity indicators
        self.severity_indicators = {
            'EMERGENCY': {
                'keywords': ['down', 'outage', 'critical', 'emergency', 'complete', 'total', 'all', 'entire', 'system'],
                'impact_multiplier': 2.0,
                'user_impact_threshold': 0.8,
                'business_impact_threshold': 0.9
            },
            'CRITICAL': {
                'keywords': ['major', 'significant', 'severe', 'blocking', 'cannot', 'unable', 'failed', 'broken'],
                'impact_multiplier': 1.5,
                'user_impact_threshold': 0.6,
                'business_impact_threshold': 0.7
            },
            'HIGH': {
                'keywords': ['important', 'priority', 'escalate', 'escalated', 'urgent', 'immediate', 'soon'],
                'impact_multiplier': 1.2,
                'user_impact_threshold': 0.4,
                'business_impact_threshold': 0.5
            },
            'MEDIUM': {
                'keywords': ['moderate', 'some', 'partial', 'intermittent', 'occasional', 'sometimes', 'minor'],
                'impact_multiplier': 1.0,
                'user_impact_threshold': 0.2,
                'business_impact_threshold': 0.3
            },
            'LOW': {
                'keywords': ['small', 'cosmetic', 'enhancement', 'improvement', 'suggestion', 'nice', 'to', 'have'],
                'impact_multiplier': 0.5,
                'user_impact_threshold': 0.1,
                'business_impact_threshold': 0.1
            }
        }

    def analyze_severity(self, incident_data: Dict) -> Dict:
        """
        Analyze incident severity based on various factors
        """
        start_time = time.time()

        title = incident_data.get('title', '').lower()
        description = incident_data.get('description', '').lower()
        free_text = incident_data.get('free_text', '').lower()
        affected_users = incident_data.get('affected_users', 0)
        business_impact = incident_data.get('business_impact', '').lower()

        combined_text = f"{title} {description} {free_text} {business_impact}"

        # Calculate impact scores
        user_impact_score = self._calculate_user_impact(affected_users, combined_text)
        business_impact_score = self._calculate_business_impact(business_impact, combined_text)
        technical_impact_score = self._calculate_technical_impact(combined_text)

        # Determine severity based on impact scores and keywords
        suggested_severity, confidence, reasoning, impact_factors = self._determine_severity(
            combined_text, user_impact_score, business_impact_score, technical_impact_score
        )

        processing_time = time.time() - start_time

        return {
            'suggested_severity': suggested_severity,
            'confidence_score': confidence,
            'user_impact_score': user_impact_score,
            'business_impact_score': business_impact_score,
            'technical_impact_score': technical_impact_score,
            'reasoning': reasoning,
            'impact_factors': impact_factors,
            'processing_time': processing_time
        }

    def _calculate_user_impact(self, affected_users: int, text: str) -> float:
        """Calculate user impact score (0-1)"""
        # Base score from affected users count
        if affected_users == 0:
            # Try to extract from text
            user_indicators = ['all users', 'everyone', 'entire user base', 'all customers']
            if any(indicator in text for indicator in user_indicators):
                base_score = 0.9
            else:
                base_score = 0.1
        elif affected_users < 10:
            base_score = 0.2
        elif affected_users < 100:
            base_score = 0.4
        elif affected_users < 1000:
            base_score = 0.6
        elif affected_users < 10000:
            base_score = 0.8
        else:
            base_score = 1.0

        # Adjust based on text indicators
        if 'all' in text or 'everyone' in text:
            base_score = min(base_score + 0.2, 1.0)
        elif 'some' in text or 'few' in text:
            base_score = max(base_score - 0.1, 0.0)

        return base_score

    def _calculate_business_impact(self, business_impact: str, text: str) -> float:
        """Calculate business impact score (0-1)"""
        if not business_impact:
            # Try to infer from text
            high_impact_indicators = ['revenue', 'sales', 'customer', 'business', 'critical', 'essential', 'production']
            if any(indicator in text for indicator in high_impact_indicators):
                return 0.6
            return 0.3

        # Analyze business impact text
        high_impact_keywords = ['revenue', 'sales', 'customer', 'business', 'critical', 'essential', 'production', 'outage', 'down']
        medium_impact_keywords = ['service', 'feature', 'functionality', 'performance', 'slow']
        low_impact_keywords = ['cosmetic', 'minor', 'enhancement', 'improvement']

        score = 0.3  # Base score

        for keyword in high_impact_keywords:
            if keyword in business_impact:
                score += 0.1

        for keyword in medium_impact_keywords:
            if keyword in business_impact:
                score += 0.05

        for keyword in low_impact_keywords:
            if keyword in business_impact:
                score -= 0.05

        return min(max(score, 0.0), 1.0)

    def _calculate_technical_impact(self, text: str) -> float:
        """Calculate technical impact score (0-1)"""
        technical_indicators = {
            'high': ['down', 'outage', 'crash', 'failed', 'broken', 'unavailable', 'error', 'exception'],
            'medium': ['slow', 'performance', 'latency', 'timeout', 'intermittent', 'partial'],
            'low': ['cosmetic', 'display', 'ui', 'minor', 'enhancement']
        }

        score = 0.3  # Base score

        for level, keywords in technical_indicators.items():
            for keyword in keywords:
                if keyword in text:
                    if level == 'high':
                        score += 0.15
                    elif level == 'medium':
                        score += 0.08
                    elif level == 'low':
                        score -= 0.05

        return min(max(score, 0.0), 1.0)

    def _determine_severity(self, text: str, user_impact: float, business_impact: float, technical_impact: float) -> Tuple[str, float, str, List[str]]:
        """Determine severity based on impact scores and text analysis"""
        impact_factors = []

        # Calculate weighted impact score
        weighted_score = (user_impact * 0.4 + business_impact * 0.4 + technical_impact * 0.2)

        # Check for severity indicators in text
        severity_scores = {}
        for severity, data in self.severity_indicators.items():
            score = 0
            for keyword in data['keywords']:
                if keyword in text:
                    score += 1

            # Apply impact multiplier
            score *= data['impact_multiplier']
            severity_scores[severity] = score

        # Find best severity match
        if severity_scores and max(severity_scores.values()) > 0:
            best_severity = max(severity_scores, key=severity_scores.get)
            text_confidence = min(max(severity_scores.values()) / 5.0, 1.0)
        else:
            # Fallback to impact-based severity
            if weighted_score >= 0.8:
                best_severity = 'CRITICAL'
            elif weighted_score >= 0.6:
                best_severity = 'HIGH'
            elif weighted_score >= 0.4:
                best_severity = 'MEDIUM'
            else:
                best_severity = 'LOW'
            text_confidence = 0.5

        # Combine text and impact confidence
        confidence = (text_confidence + (1.0 - abs(weighted_score - self._severity_to_score(best_severity)))) / 2.0

        # Generate reasoning
        reasoning_parts = []
        if user_impact > 0.6:
            reasoning_parts.append(f"High user impact ({user_impact:.1%})")
            impact_factors.append(f"User Impact: {user_impact:.1%}")

        if business_impact > 0.6:
            reasoning_parts.append(f"Significant business impact ({business_impact:.1%})")
            impact_factors.append(f"Business Impact: {business_impact:.1%}")

        if technical_impact > 0.6:
            reasoning_parts.append(f"Major technical impact ({technical_impact:.1%})")
            impact_factors.append(f"Technical Impact: {technical_impact:.1%}")

        if severity_scores and max(severity_scores.values()) > 0:
            reasoning_parts.append("Severity indicators detected in incident description")
            impact_factors.append("Text Analysis: Severity keywords found")

        reasoning = "; ".join(reasoning_parts) if reasoning_parts else "Based on overall impact assessment"

        return best_severity, confidence, reasoning, impact_factors

    def _severity_to_score(self, severity: str) -> float:
        """Convert severity level to numeric score"""
        severity_scores = {
            'LOW': 0.2,
            'MEDIUM': 0.4,
            'HIGH': 0.6,
            'CRITICAL': 0.8,
            'EMERGENCY': 1.0
        }
        return severity_scores.get(severity, 0.4)
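
A short usage sketch for the two classes above (editor's illustration, not part of the commit). The import path mirrors the file added in this commit, the incident payload is hypothetical, and Django only needs to be importable for the module-level django.conf import to resolve.

from incident_intelligence.ai.classification import IncidentClassifier, SeverityAnalyzer

classifier = IncidentClassifier()
analyzer = SeverityAnalyzer()

# Hypothetical incident payload; keys match the ones read by analyze_severity()
incident = {
    'title': "Checkout API outage",
    'description': "All requests fail with 500 errors; the payment service is down.",
    'free_text': "Started right after the 14:00 deploy.",
    'affected_users': 2500,
    'business_impact': "Revenue impact: customers cannot complete orders.",
}

classification = classifier.classify_incident(incident['title'], incident['description'], incident['free_text'])
assessment = analyzer.analyze_severity(incident)
print(classification.category, classification.subcategory, round(classification.confidence, 2))
print(assessment['suggested_severity'], round(assessment['confidence_score'], 2))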
481 ETB-API/incident_intelligence/ai/correlation.py Normal file
@@ -0,0 +1,481 @@
"""
Correlation engine for linking related incidents and problem detection
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from django.utils import timezone
from .classification import IncidentClassifier


@dataclass
class CorrelationResult:
    """Result of incident correlation analysis"""
    correlation_type: str
    confidence_score: float
    correlation_strength: str
    shared_keywords: List[str]
    time_difference: timedelta
    similarity_score: float
    is_problem_indicator: bool
    problem_description: Optional[str]


class IncidentCorrelationEngine:
    """
    AI-driven correlation engine for linking related incidents
    """

    def __init__(self):
        self.model_version = "v1.0"
        self.classifier = IncidentClassifier()

        # Correlation thresholds
        self.correlation_thresholds = {
            'VERY_STRONG': 0.9,
            'STRONG': 0.7,
            'MODERATE': 0.5,
            'WEAK': 0.3
        }

        # Problem detection patterns
        self.problem_patterns = {
            'CASCADE_FAILURE': {
                'keywords': ['cascade', 'chain', 'reaction', 'domino', 'ripple', 'effect'],
                'time_window': timedelta(hours=2),
                'min_incidents': 3
            },
            'RECURRING_ISSUE': {
                'keywords': ['same', 'again', 'recurring', 'repeated', 'similar', 'identical'],
                'time_window': timedelta(days=7),
                'min_incidents': 2
            },
            'SERVICE_DEPENDENCY': {
                'keywords': ['dependency', 'dependent', 'downstream', 'upstream', 'service', 'api'],
                'time_window': timedelta(hours=1),
                'min_incidents': 2
            },
            'INFRASTRUCTURE_PATTERN': {
                'keywords': ['server', 'database', 'network', 'storage', 'infrastructure'],
                'time_window': timedelta(hours=4),
                'min_incidents': 3
            }
        }

    def correlate_incidents(self, incident_a: Dict, incident_b: Dict) -> Optional[CorrelationResult]:
        """
        Correlate two incidents and determine if they are related
        """
        # Calculate various similarity metrics
        text_similarity = self._calculate_text_similarity(incident_a, incident_b)
        temporal_similarity = self._calculate_temporal_similarity(incident_a, incident_b)
        service_similarity = self._calculate_service_similarity(incident_a, incident_b)
        category_similarity = self._calculate_category_similarity(incident_a, incident_b)

        # Calculate overall similarity score
        overall_similarity = (
            text_similarity * 0.4 +
            temporal_similarity * 0.2 +
            service_similarity * 0.2 +
            category_similarity * 0.2
        )

        # Determine if incidents are correlated
        if overall_similarity < 0.3:
            return None

        # Determine correlation type
        correlation_type = self._determine_correlation_type(
            incident_a, incident_b, text_similarity, temporal_similarity, service_similarity
        )

        # Calculate confidence score
        confidence_score = self._calculate_confidence_score(
            overall_similarity, correlation_type, incident_a, incident_b
        )

        # Determine correlation strength
        correlation_strength = self._determine_correlation_strength(confidence_score)

        # Extract shared keywords
        shared_keywords = self._extract_shared_keywords(incident_a, incident_b)

        # Calculate time difference
        time_diff = self._calculate_time_difference(incident_a, incident_b)

        # Check for problem indicators
        is_problem_indicator, problem_description = self._detect_problem_patterns(
            incident_a, incident_b, correlation_type, confidence_score
        )

        return CorrelationResult(
            correlation_type=correlation_type,
            confidence_score=confidence_score,
            correlation_strength=correlation_strength,
            shared_keywords=shared_keywords,
            time_difference=time_diff,
            similarity_score=overall_similarity,
            is_problem_indicator=is_problem_indicator,
            problem_description=problem_description
        )
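
    # --- Editor's note (illustration only, not part of this commit): the overall
    # similarity above is a fixed-weight blend. With, say, text=0.55, temporal=0.7,
    # service=0.5 and category=1.0:
    #     0.55*0.4 + 0.7*0.2 + 0.5*0.2 + 1.0*0.2 = 0.22 + 0.14 + 0.10 + 0.20 = 0.66,
    # which clears the 0.3 cut-off, so a CorrelationResult is returned.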

    def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate text similarity between two incidents"""
        # Combine text fields
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()

        # Extract keywords
        keywords_a = set(self.classifier._extract_keywords(text_a))
        keywords_b = set(self.classifier._extract_keywords(text_b))

        if not keywords_a or not keywords_b:
            return 0.0

        # Calculate Jaccard similarity
        intersection = len(keywords_a.intersection(keywords_b))
        union = len(keywords_a.union(keywords_b))

        jaccard_similarity = intersection / union if union > 0 else 0.0

        # Also check for exact phrase matches
        phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)

        # Combine similarities
        return (jaccard_similarity * 0.7 + phrase_similarity * 0.3)

    def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
        """Calculate similarity based on common phrases"""
        # Extract 2-3 word phrases
        phrases_a = set()
        phrases_b = set()

        words_a = text_a.split()
        words_b = text_b.split()

        # Extract 2-word phrases
        for i in range(len(words_a) - 1):
            phrases_a.add(f"{words_a[i]} {words_a[i+1]}")

        for i in range(len(words_b) - 1):
            phrases_b.add(f"{words_b[i]} {words_b[i+1]}")

        # Extract 3-word phrases
        for i in range(len(words_a) - 2):
            phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")

        for i in range(len(words_b) - 2):
            phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")

        if not phrases_a or not phrases_b:
            return 0.0

        intersection = len(phrases_a.intersection(phrases_b))
        union = len(phrases_a.union(phrases_b))

        return intersection / union if union > 0 else 0.0

    def _calculate_temporal_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate temporal similarity between incidents"""
        created_a = incident_a.get('created_at')
        created_b = incident_b.get('created_at')

        if not created_a or not created_b:
            return 0.0

        # Convert to datetime if needed
        if isinstance(created_a, str):
            created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
        if isinstance(created_b, str):
            created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))

        time_diff = abs((created_a - created_b).total_seconds())

        # Calculate similarity based on time difference:
        # incidents within 1 hour: high similarity;
        # incidents within 24 hours: medium similarity;
        # incidents within 7 days: low similarity.
        if time_diff <= 3600:  # 1 hour
            return 1.0
        elif time_diff <= 86400:  # 24 hours
            return 0.7
        elif time_diff <= 604800:  # 7 days
            return 0.3
        else:
            return 0.0

    def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate service/component similarity"""
        # Extract service/component information from text
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()

        # Common service/component keywords
        service_keywords = [
            'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
            'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
            'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth'
        ]

        services_a = set()
        services_b = set()

        for keyword in service_keywords:
            if keyword in text_a:
                services_a.add(keyword)
            if keyword in text_b:
                services_b.add(keyword)

        if not services_a or not services_b:
            return 0.0

        intersection = len(services_a.intersection(services_b))
        union = len(services_a.union(services_b))

        return intersection / union if union > 0 else 0.0

    def _calculate_category_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate category similarity"""
        category_a = incident_a.get('category', '')
        category_b = incident_b.get('category', '')

        if not category_a or not category_b:
            return 0.0

        if category_a == category_b:
            return 1.0

        # Check for related categories
        related_categories = {
            'INFRASTRUCTURE': ['APPLICATION', 'SECURITY'],
            'APPLICATION': ['INFRASTRUCTURE', 'USER_EXPERIENCE'],
            'SECURITY': ['INFRASTRUCTURE', 'APPLICATION'],
            'USER_EXPERIENCE': ['APPLICATION', 'DATA'],
            'DATA': ['USER_EXPERIENCE', 'INTEGRATION'],
            'INTEGRATION': ['DATA', 'APPLICATION']
        }

        if category_b in related_categories.get(category_a, []):
            return 0.5

        return 0.0

    def _determine_correlation_type(self, incident_a: Dict, incident_b: Dict,
                                    text_similarity: float, temporal_similarity: float,
                                    service_similarity: float) -> str:
        """Determine the type of correlation between incidents"""

        # Same service correlation
        if service_similarity > 0.7:
            return 'SAME_SERVICE'

        # Same component correlation
        if text_similarity > 0.6 and service_similarity > 0.4:
            return 'SAME_COMPONENT'

        # Temporal correlation
        if temporal_similarity > 0.7 and text_similarity > 0.3:
            return 'TEMPORAL'

        # Pattern match
        if text_similarity > 0.5:
            return 'PATTERN'

        # Dependency correlation
        if service_similarity > 0.4 and temporal_similarity > 0.5:
            return 'DEPENDENCY'

        # Cascade effect
        if temporal_similarity > 0.8 and text_similarity > 0.4:
            return 'CASCADE'

        return 'PATTERN'  # Default

    def _calculate_confidence_score(self, overall_similarity: float, correlation_type: str,
                                    incident_a: Dict, incident_b: Dict) -> float:
        """Calculate confidence score for the correlation"""
        base_confidence = overall_similarity

        # Adjust based on correlation type
        type_adjustments = {
            'SAME_SERVICE': 0.1,
            'SAME_COMPONENT': 0.15,
            'TEMPORAL': 0.05,
            'PATTERN': 0.0,
            'DEPENDENCY': 0.1,
            'CASCADE': 0.2
        }

        base_confidence += type_adjustments.get(correlation_type, 0.0)

        # Adjust based on incident characteristics
        if incident_a.get('severity') == incident_b.get('severity'):
            base_confidence += 0.05

        if incident_a.get('status') == incident_b.get('status'):
            base_confidence += 0.03

        return min(base_confidence, 1.0)

    def _determine_correlation_strength(self, confidence_score: float) -> str:
        """Determine correlation strength based on confidence score"""
        if confidence_score >= self.correlation_thresholds['VERY_STRONG']:
            return 'VERY_STRONG'
        elif confidence_score >= self.correlation_thresholds['STRONG']:
            return 'STRONG'
        elif confidence_score >= self.correlation_thresholds['MODERATE']:
            return 'MODERATE'
        else:
            return 'WEAK'

    def _extract_shared_keywords(self, incident_a: Dict, incident_b: Dict) -> List[str]:
        """Extract keywords shared between incidents"""
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()

        keywords_a = set(self.classifier._extract_keywords(text_a))
        keywords_b = set(self.classifier._extract_keywords(text_b))

        shared = list(keywords_a.intersection(keywords_b))
        return shared[:10]  # Return top 10 shared keywords

    def _calculate_time_difference(self, incident_a: Dict, incident_b: Dict) -> timedelta:
        """Calculate time difference between incidents"""
        created_a = incident_a.get('created_at')
        created_b = incident_b.get('created_at')

        if not created_a or not created_b:
            return timedelta(0)

        # Convert to datetime if needed
        if isinstance(created_a, str):
            created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
        if isinstance(created_b, str):
            created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))

        return abs(created_a - created_b)

    def _detect_problem_patterns(self, incident_a: Dict, incident_b: Dict,
                                 correlation_type: str, confidence_score: float) -> Tuple[bool, Optional[str]]:
        """Detect if correlation indicates a larger problem"""

        # High confidence correlations are more likely to indicate problems
        if confidence_score < 0.6:
            return False, None

        # Check for specific problem patterns
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
        combined_text = f"{text_a} {text_b}"

        for pattern_name, pattern_data in self.problem_patterns.items():
            # Check for pattern keywords
            keyword_matches = sum(1 for keyword in pattern_data['keywords'] if keyword in combined_text)

            if keyword_matches >= 2:  # At least 2 keywords match
                return True, f"Potential {pattern_name.replace('_', ' ').lower()} detected"

        # Check for cascade effects
        if correlation_type == 'CASCADE' and confidence_score > 0.7:
            return True, "Potential cascade failure detected"

        # Check for recurring issues
        if correlation_type == 'SAME_SERVICE' and confidence_score > 0.8:
            return True, "Potential recurring service issue detected"

        return False, None

    def find_related_incidents(self, target_incident: Dict, all_incidents: List[Dict],
                               limit: int = 10) -> List[Tuple[Dict, CorrelationResult]]:
        """Find incidents related to a target incident"""
        correlations = []

        for incident in all_incidents:
            if incident['id'] == target_incident['id']:
                continue

            correlation = self.correlate_incidents(target_incident, incident)
            if correlation:
                correlations.append((incident, correlation))

        # Sort by confidence score and return top results
        correlations.sort(key=lambda x: x[1].confidence_score, reverse=True)
        return correlations[:limit]

    def detect_problem_clusters(self, incidents: List[Dict],
                                min_incidents: int = 3,
                                time_window: timedelta = timedelta(hours=24)) -> List[Dict]:
        """Detect clusters of related incidents that might indicate larger problems"""
        clusters = []
        processed_incidents = set()

        for incident in incidents:
            if incident['id'] in processed_incidents:
                continue

            # Find related incidents within time window
            related_incidents = []
            incident_time = incident.get('created_at')

            if isinstance(incident_time, str):
                incident_time = datetime.fromisoformat(incident_time.replace('Z', '+00:00'))

            for other_incident in incidents:
                if other_incident['id'] == incident['id'] or other_incident['id'] in processed_incidents:
                    continue

                other_time = other_incident.get('created_at')
                if isinstance(other_time, str):
                    other_time = datetime.fromisoformat(other_time.replace('Z', '+00:00'))

                # Check if within time window
                if abs((incident_time - other_time).total_seconds()) <= time_window.total_seconds():
                    correlation = self.correlate_incidents(incident, other_incident)
                    if correlation and correlation.confidence_score > 0.5:
                        related_incidents.append((other_incident, correlation))

            # If we found enough related incidents, create a cluster
            if len(related_incidents) >= min_incidents - 1:  # -1 because we include the original incident
                cluster = {
                    'incidents': [incident] + [inc[0] for inc in related_incidents],
                    'correlations': [inc[1] for inc in related_incidents],
                    'problem_type': self._classify_problem_type(incident, related_incidents),
                    'confidence': sum(inc[1].confidence_score for inc in related_incidents) / len(related_incidents),
                    'time_span': self._calculate_cluster_time_span([incident] + [inc[0] for inc in related_incidents])
                }
                clusters.append(cluster)

                # Mark incidents as processed
                processed_incidents.add(incident['id'])
                for related_incident, _ in related_incidents:
                    processed_incidents.add(related_incident['id'])

        return clusters

    def _classify_problem_type(self, incident: Dict, related_incidents: List[Tuple[Dict, CorrelationResult]]) -> str:
        """Classify the type of problem based on incident cluster"""
        correlation_types = [corr.correlation_type for _, corr in related_incidents]

        if 'CASCADE' in correlation_types:
            return 'CASCADE_FAILURE'
        elif 'SAME_SERVICE' in correlation_types:
            return 'SERVICE_OUTAGE'
        elif 'TEMPORAL' in correlation_types:
            return 'RECURRING_ISSUE'
        else:
            return 'PATTERN_BASED_PROBLEM'

    def _calculate_cluster_time_span(self, incidents: List[Dict]) -> timedelta:
        """Calculate the time span of a cluster of incidents"""
        times = []
        for incident in incidents:
            created_at = incident.get('created_at')
            if isinstance(created_at, str):
                created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
            times.append(created_at)

        if len(times) < 2:
            return timedelta(0)

        return max(times) - min(times)
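
A usage sketch for the correlation engine above (editor's illustration, not part of the commit). The two incidents and their timestamps are hypothetical; created_at may be a datetime or an ISO-8601 string, as handled in the code.

from incident_intelligence.ai.correlation import IncidentCorrelationEngine

engine = IncidentCorrelationEngine()
incidents = [
    {'id': 1, 'title': "Orders API timeouts", 'description': "database connection pool exhausted",
     'created_at': '2024-05-01T10:00:00Z', 'category': 'INFRASTRUCTURE', 'severity': 'HIGH', 'status': 'OPEN'},
    {'id': 2, 'title': "Orders API errors", 'description': "api timeouts and database deadlocks reported",
     'created_at': '2024-05-01T10:20:00Z', 'category': 'INFRASTRUCTURE', 'severity': 'HIGH', 'status': 'OPEN'},
]

# Pairwise correlation of one incident against the rest
for incident, corr in engine.find_related_incidents(incidents[0], incidents):
    print(incident['id'], corr.correlation_type, corr.correlation_strength, round(corr.confidence_score, 2))

# Cluster detection over the whole list
clusters = engine.detect_problem_clusters(incidents, min_incidents=2)
print(len(clusters), "cluster(s) detected")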
516 ETB-API/incident_intelligence/ai/duplication.py Normal file
@@ -0,0 +1,516 @@
"""
Duplication detection engine for identifying and merging duplicate incidents
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from .classification import IncidentClassifier


@dataclass
class DuplicationResult:
    """Result of duplication detection analysis"""
    duplication_type: str
    similarity_score: float
    confidence_score: float
    text_similarity: float
    temporal_proximity: float
    service_similarity: float
    recommended_action: str
    merge_confidence: float
    reasoning: str
    shared_elements: List[str]


class DuplicationDetector:
    """
    AI-driven duplication detector for identifying duplicate incidents
    """

    def __init__(self):
        self.model_version = "v1.0"
        self.classifier = IncidentClassifier()

        # Duplication thresholds
        self.duplication_thresholds = {
            'EXACT': 0.95,
            'NEAR_DUPLICATE': 0.85,
            'SIMILAR': 0.70,
            'POTENTIAL_DUPLICATE': 0.50
        }

        # Action thresholds
        self.action_thresholds = {
            'MERGE': 0.90,
            'LINK': 0.75,
            'REVIEW': 0.60,
            'NO_ACTION': 0.0
        }

        # Time windows for temporal proximity
        self.time_windows = {
            'EXACT': timedelta(minutes=30),
            'NEAR_DUPLICATE': timedelta(hours=2),
            'SIMILAR': timedelta(hours=24),
            'POTENTIAL_DUPLICATE': timedelta(days=7)
        }

    def detect_duplication(self, incident_a: Dict, incident_b: Dict) -> Optional[DuplicationResult]:
        """
        Detect if two incidents are duplicates
        """
        # Calculate various similarity metrics
        text_similarity = self._calculate_text_similarity(incident_a, incident_b)
        temporal_proximity = self._calculate_temporal_proximity(incident_a, incident_b)
        service_similarity = self._calculate_service_similarity(incident_a, incident_b)
        metadata_similarity = self._calculate_metadata_similarity(incident_a, incident_b)

        # Calculate overall similarity score
        overall_similarity = (
            text_similarity * 0.5 +
            temporal_proximity * 0.2 +
            service_similarity * 0.2 +
            metadata_similarity * 0.1
        )

        # Determine duplication type
        duplication_type = self._determine_duplication_type(overall_similarity, text_similarity, temporal_proximity)

        if duplication_type == 'NO_DUPLICATE':
            return None

        # Calculate confidence score
        confidence_score = self._calculate_confidence_score(
            overall_similarity, text_similarity, temporal_proximity, service_similarity
        )

        # Determine recommended action
        recommended_action = self._determine_recommended_action(confidence_score, duplication_type)

        # Calculate merge confidence
        merge_confidence = self._calculate_merge_confidence(
            confidence_score, duplication_type, incident_a, incident_b
        )

        # Generate reasoning
        reasoning = self._generate_reasoning(
            duplication_type, text_similarity, temporal_proximity, service_similarity
        )

        # Extract shared elements
        shared_elements = self._extract_shared_elements(incident_a, incident_b)

        return DuplicationResult(
            duplication_type=duplication_type,
            similarity_score=overall_similarity,
            confidence_score=confidence_score,
            text_similarity=text_similarity,
            temporal_proximity=temporal_proximity,
            service_similarity=service_similarity,
            recommended_action=recommended_action,
            merge_confidence=merge_confidence,
            reasoning=reasoning,
            shared_elements=shared_elements
        )
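
    # --- Editor's note (illustration only, not part of this commit): worked example
    # of the blend above. With text=0.9, temporal=0.9, service=1.0, metadata=0.6:
    #     0.9*0.5 + 0.9*0.2 + 1.0*0.2 + 0.6*0.1 = 0.45 + 0.18 + 0.20 + 0.06 = 0.89,
    # i.e. NEAR_DUPLICATE (>= 0.85 but below the 0.95 EXACT threshold).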

    def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate text similarity between incidents"""
        # Combine all text fields
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()

        # Calculate multiple similarity metrics
        jaccard_similarity = self._calculate_jaccard_similarity(text_a, text_b)
        cosine_similarity = self._calculate_cosine_similarity(text_a, text_b)
        phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)
        semantic_similarity = self._calculate_semantic_similarity(text_a, text_b)

        # Weighted combination
        return (
            jaccard_similarity * 0.3 +
            cosine_similarity * 0.3 +
            phrase_similarity * 0.2 +
            semantic_similarity * 0.2
        )

    def _calculate_jaccard_similarity(self, text_a: str, text_b: str) -> float:
        """Calculate Jaccard similarity based on word sets"""
        words_a = set(text_a.split())
        words_b = set(text_b.split())

        if not words_a or not words_b:
            return 0.0

        intersection = len(words_a.intersection(words_b))
        union = len(words_a.union(words_b))

        return intersection / union if union > 0 else 0.0

    def _calculate_cosine_similarity(self, text_a: str, text_b: str) -> float:
        """Calculate cosine similarity based on word frequency"""
        from collections import Counter

        words_a = Counter(text_a.split())
        words_b = Counter(text_b.split())

        # Get all unique words
        all_words = set(words_a.keys()) | set(words_b.keys())

        if not all_words:
            return 0.0

        # Create vectors
        vector_a = [words_a.get(word, 0) for word in all_words]
        vector_b = [words_b.get(word, 0) for word in all_words]

        # Calculate cosine similarity
        dot_product = sum(a * b for a, b in zip(vector_a, vector_b))
        magnitude_a = sum(a * a for a in vector_a) ** 0.5
        magnitude_b = sum(b * b for b in vector_b) ** 0.5

        if magnitude_a == 0 or magnitude_b == 0:
            return 0.0

        return dot_product / (magnitude_a * magnitude_b)

    def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
        """Calculate similarity based on common phrases"""
        # Extract 2-3 word phrases
        phrases_a = set()
        phrases_b = set()

        words_a = text_a.split()
        words_b = text_b.split()

        # Extract 2-word phrases
        for i in range(len(words_a) - 1):
            phrases_a.add(f"{words_a[i]} {words_a[i+1]}")

        for i in range(len(words_b) - 1):
            phrases_b.add(f"{words_b[i]} {words_b[i+1]}")

        # Extract 3-word phrases
        for i in range(len(words_a) - 2):
            phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")

        for i in range(len(words_b) - 2):
            phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")

        if not phrases_a or not phrases_b:
            return 0.0

        intersection = len(phrases_a.intersection(phrases_b))
        union = len(phrases_a.union(phrases_b))

        return intersection / union if union > 0 else 0.0

    def _calculate_semantic_similarity(self, text_a: str, text_b: str) -> float:
        """Calculate semantic similarity using keyword analysis"""
        # Extract keywords using the classifier
        keywords_a = set(self.classifier._extract_keywords(text_a))
        keywords_b = set(self.classifier._extract_keywords(text_b))

        if not keywords_a or not keywords_b:
            return 0.0

        # Calculate semantic similarity based on keyword overlap
        intersection = len(keywords_a.intersection(keywords_b))
        union = len(keywords_a.union(keywords_b))

        base_similarity = intersection / union if union > 0 else 0.0

        # Boost similarity for technical terms
        technical_terms = {
            'error', 'exception', 'timeout', 'connection', 'database', 'server',
            'api', 'service', 'application', 'network', 'storage', 'memory',
            'cpu', 'disk', 'bandwidth', 'latency', 'performance', 'crash'
        }

        technical_intersection = len(keywords_a.intersection(keywords_b).intersection(technical_terms))
        if technical_intersection > 0:
            base_similarity += 0.1 * technical_intersection

        return min(base_similarity, 1.0)

    def _calculate_temporal_proximity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate temporal proximity between incidents"""
        created_a = incident_a.get('created_at')
        created_b = incident_b.get('created_at')

        if not created_a or not created_b:
            return 0.0

        # Convert to datetime if needed
        if isinstance(created_a, str):
            created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
        if isinstance(created_b, str):
            created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))

        time_diff = abs((created_a - created_b).total_seconds())

        # Calculate proximity score based on time difference
        if time_diff <= 300:  # 5 minutes
            return 1.0
        elif time_diff <= 1800:  # 30 minutes
            return 0.9
        elif time_diff <= 3600:  # 1 hour
            return 0.7
        elif time_diff <= 7200:  # 2 hours
            return 0.5
        elif time_diff <= 86400:  # 24 hours
            return 0.3
        elif time_diff <= 604800:  # 7 days
            return 0.1
        else:
            return 0.0

    def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate service/component similarity"""
        # Extract service information from text
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()

        # Service/component keywords
        service_keywords = [
            'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
            'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
            'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth',
            'user service', 'order service', 'payment service', 'notification service'
        ]

        services_a = set()
        services_b = set()

        for keyword in service_keywords:
            if keyword in text_a:
                services_a.add(keyword)
            if keyword in text_b:
                services_b.add(keyword)

        if not services_a or not services_b:
            return 0.0

        intersection = len(services_a.intersection(services_b))
        union = len(services_a.union(services_b))

        return intersection / union if union > 0 else 0.0

    def _calculate_metadata_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
        """Calculate similarity based on metadata fields"""
        similarity_score = 0.0
        total_fields = 0

        # Compare severity
        if incident_a.get('severity') == incident_b.get('severity'):
            similarity_score += 1.0
        total_fields += 1

        # Compare status
        if incident_a.get('status') == incident_b.get('status'):
            similarity_score += 1.0
        total_fields += 1

        # Compare category
        if incident_a.get('category') == incident_b.get('category'):
            similarity_score += 1.0
        total_fields += 1

        # Compare assigned user
        if incident_a.get('assigned_to') == incident_b.get('assigned_to'):
            similarity_score += 1.0
        total_fields += 1

        # Compare reporter
        if incident_a.get('reporter') == incident_b.get('reporter'):
            similarity_score += 1.0
        total_fields += 1

        return similarity_score / total_fields if total_fields > 0 else 0.0

    def _determine_duplication_type(self, overall_similarity: float, text_similarity: float,
                                    temporal_proximity: float) -> str:
        """Determine the type of duplication"""
        if overall_similarity >= self.duplication_thresholds['EXACT']:
            return 'EXACT'
        elif overall_similarity >= self.duplication_thresholds['NEAR_DUPLICATE']:
            return 'NEAR_DUPLICATE'
        elif overall_similarity >= self.duplication_thresholds['SIMILAR']:
            return 'SIMILAR'
        elif overall_similarity >= self.duplication_thresholds['POTENTIAL_DUPLICATE']:
            return 'POTENTIAL_DUPLICATE'
        else:
            return 'NO_DUPLICATE'

    def _calculate_confidence_score(self, overall_similarity: float, text_similarity: float,
                                    temporal_proximity: float, service_similarity: float) -> float:
        """Calculate confidence score for duplication detection"""
        base_confidence = overall_similarity

        # Boost confidence for high text similarity
        if text_similarity > 0.8:
            base_confidence += 0.1

        # Boost confidence for high temporal proximity
        if temporal_proximity > 0.8:
            base_confidence += 0.1

        # Boost confidence for high service similarity
        if service_similarity > 0.8:
            base_confidence += 0.05

        return min(base_confidence, 1.0)

    def _determine_recommended_action(self, confidence_score: float, duplication_type: str) -> str:
        """Determine recommended action based on confidence and duplication type"""
        if confidence_score >= self.action_thresholds['MERGE']:
            return 'MERGE'
        elif confidence_score >= self.action_thresholds['LINK']:
            return 'LINK'
        elif confidence_score >= self.action_thresholds['REVIEW']:
            return 'REVIEW'
        else:
            return 'NO_ACTION'

    def _calculate_merge_confidence(self, confidence_score: float, duplication_type: str,
                                    incident_a: Dict, incident_b: Dict) -> float:
        """Calculate confidence for merging incidents"""
        merge_confidence = confidence_score

        # Adjust based on duplication type
        type_adjustments = {
            'EXACT': 0.1,
            'NEAR_DUPLICATE': 0.05,
            'SIMILAR': 0.0,
            'POTENTIAL_DUPLICATE': -0.1
        }

        merge_confidence += type_adjustments.get(duplication_type, 0.0)

        # Adjust based on incident status
        if incident_a.get('status') == incident_b.get('status'):
            merge_confidence += 0.05

        # Adjust based on severity
        if incident_a.get('severity') == incident_b.get('severity'):
            merge_confidence += 0.03

        return min(max(merge_confidence, 0.0), 1.0)

    def _generate_reasoning(self, duplication_type: str, text_similarity: float,
                            temporal_proximity: float, service_similarity: float) -> str:
        """Generate human-readable reasoning for duplication detection"""
        reasoning_parts = []

        if text_similarity > 0.8:
            reasoning_parts.append(f"Very high text similarity ({text_similarity:.1%})")
        elif text_similarity > 0.6:
            reasoning_parts.append(f"High text similarity ({text_similarity:.1%})")
        elif text_similarity > 0.4:
            reasoning_parts.append(f"Moderate text similarity ({text_similarity:.1%})")

        if temporal_proximity > 0.8:
            reasoning_parts.append(f"Very close temporal proximity ({temporal_proximity:.1%})")
        elif temporal_proximity > 0.6:
            reasoning_parts.append(f"Close temporal proximity ({temporal_proximity:.1%})")

        if service_similarity > 0.8:
            reasoning_parts.append(f"Very high service similarity ({service_similarity:.1%})")
        elif service_similarity > 0.6:
            reasoning_parts.append(f"High service similarity ({service_similarity:.1%})")

        if duplication_type == 'EXACT':
            reasoning_parts.append("Incidents appear to be exact duplicates")
        elif duplication_type == 'NEAR_DUPLICATE':
            reasoning_parts.append("Incidents appear to be near duplicates")
        elif duplication_type == 'SIMILAR':
            reasoning_parts.append("Incidents appear to be similar")
        elif duplication_type == 'POTENTIAL_DUPLICATE':
            reasoning_parts.append("Incidents may be duplicates")

        return "; ".join(reasoning_parts) if reasoning_parts else "Based on overall similarity analysis"

    def _extract_shared_elements(self, incident_a: Dict, incident_b: Dict) -> List[str]:
        """Extract elements shared between incidents"""
        shared_elements = []

        # Shared keywords
        text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
        text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()

        keywords_a = set(self.classifier._extract_keywords(text_a))
        keywords_b = set(self.classifier._extract_keywords(text_b))
        shared_keywords = keywords_a.intersection(keywords_b)

        if shared_keywords:
            shared_elements.append(f"Keywords: {', '.join(list(shared_keywords)[:5])}")

        # Shared services
        service_keywords = [
            'api', 'service', 'database', 'server', 'application', 'website', 'mobile'
        ]

        services_a = set()
        services_b = set()

        for keyword in service_keywords:
            if keyword in text_a:
                services_a.add(keyword)
            if keyword in text_b:
                services_b.add(keyword)

        shared_services = services_a.intersection(services_b)
        if shared_services:
            shared_elements.append(f"Services: {', '.join(shared_services)}")

        # Shared metadata
        if incident_a.get('severity') == incident_b.get('severity'):
            shared_elements.append(f"Severity: {incident_a.get('severity')}")

        if incident_a.get('category') == incident_b.get('category'):
            shared_elements.append(f"Category: {incident_a.get('category')}")

        if incident_a.get('status') == incident_b.get('status'):
            shared_elements.append(f"Status: {incident_a.get('status')}")

        return shared_elements

    def find_duplicate_candidates(self, target_incident: Dict, all_incidents: List[Dict],
                                  limit: int = 10) -> List[Tuple[Dict, DuplicationResult]]:
        """Find incidents that might be duplicates of the target incident"""
        candidates = []

        for incident in all_incidents:
            if incident['id'] == target_incident['id']:
                continue

            duplication = self.detect_duplication(target_incident, incident)
            if duplication:
                candidates.append((incident, duplication))

        # Sort by confidence score and return top results
        candidates.sort(key=lambda x: x[1].confidence_score, reverse=True)
        return candidates[:limit]

    def batch_detect_duplicates(self, incidents: List[Dict]) -> List[Tuple[Dict, Dict, DuplicationResult]]:
        """Batch detect duplicates in a list of incidents"""
        duplicates = []
        processed_pairs = set()

        for i, incident_a in enumerate(incidents):
            for j, incident_b in enumerate(incidents[i+1:], i+1):
                # Create a unique pair identifier
                pair_id = tuple(sorted([incident_a['id'], incident_b['id']]))

                if pair_id in processed_pairs:
                    continue

                processed_pairs.add(pair_id)

                duplication = self.detect_duplication(incident_a, incident_b)
                if duplication:
                    duplicates.append((incident_a, incident_b, duplication))

        # Sort by confidence score
        duplicates.sort(key=lambda x: x[2].confidence_score, reverse=True)
        return duplicates
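
A closing usage sketch for the duplication detector above (editor's illustration, not part of the commit); both incidents and their field values are hypothetical.

from incident_intelligence.ai.duplication import DuplicationDetector

detector = DuplicationDetector()
incidents = [
    {'id': 10, 'title': "Login page returns 500", 'description': "auth service error after deploy",
     'created_at': '2024-05-02T09:00:00Z', 'severity': 'HIGH', 'status': 'OPEN'},
    {'id': 11, 'title': "Login page returns 500 errors", 'description': "auth service throwing errors since deploy",
     'created_at': '2024-05-02T09:10:00Z', 'severity': 'HIGH', 'status': 'OPEN'},
]

for a, b, result in detector.batch_detect_duplicates(incidents):
    print(a['id'], b['id'], result.duplication_type, result.recommended_action, round(result.merge_confidence, 2))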