Commit 6b247e5b9f (parent 306b20e24a)
Author: Iliyan Angelov
Date: 2025-09-19 11:58:53 +03:00
11423 changed files with 1500615 additions and 778 deletions

View File

@@ -0,0 +1 @@
# AI components for incident intelligence

View File

@@ -0,0 +1,471 @@
"""
AI-driven incident classification using NLP techniques
"""
import re
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from django.conf import settings
@dataclass
class ClassificationResult:
"""Result of incident classification"""
category: str
subcategory: str
confidence: float
alternative_categories: List[Dict[str, float]]
keywords: List[str]
sentiment_score: float
urgency_indicators: List[str]
class IncidentClassifier:
"""
AI-driven incident classifier using rule-based keyword matching and lightweight heuristics
"""
def __init__(self):
self.model_version = "v1.0"
# Predefined categories and their keywords
self.categories = {
'INFRASTRUCTURE': {
'keywords': ['server', 'database', 'network', 'storage', 'disk', 'memory', 'cpu', 'load', 'bandwidth', 'connection', 'timeout', 'latency'],
'subcategories': {
'SERVER_ISSUE': ['server', 'host', 'machine', 'instance', 'vm', 'container'],
'DATABASE_ISSUE': ['database', 'db', 'sql', 'query', 'connection', 'timeout', 'deadlock'],
'NETWORK_ISSUE': ['network', 'connectivity', 'dns', 'firewall', 'routing', 'packet', 'bandwidth'],
'STORAGE_ISSUE': ['storage', 'disk', 'volume', 'space', 'capacity', 'i/o', 'read', 'write'],
}
},
'APPLICATION': {
'keywords': ['application', 'app', 'service', 'api', 'endpoint', 'response', 'error', 'exception', 'crash', 'bug'],
'subcategories': {
'API_ISSUE': ['api', 'endpoint', 'response', 'status', 'code', 'timeout', 'rate', 'limit'],
'SERVICE_ISSUE': ['service', 'microservice', 'dependency', 'circuit', 'breaker', 'fallback'],
'PERFORMANCE_ISSUE': ['performance', 'slow', 'latency', 'response', 'time', 'throughput', 'bottleneck'],
'FUNCTIONALITY_ISSUE': ['bug', 'feature', 'functionality', 'behavior', 'unexpected', 'incorrect'],
}
},
'SECURITY': {
'keywords': ['security', 'authentication', 'authorization', 'access', 'permission', 'breach', 'attack', 'vulnerability', 'malware'],
'subcategories': {
'AUTH_ISSUE': ['authentication', 'login', 'password', 'token', 'session', 'credential'],
'ACCESS_ISSUE': ['authorization', 'permission', 'access', 'denied', 'forbidden', 'unauthorized'],
'THREAT_ISSUE': ['attack', 'breach', 'malware', 'virus', 'intrusion', 'suspicious', 'anomaly'],
'VULNERABILITY': ['vulnerability', 'exploit', 'patch', 'update', 'security', 'fix'],
}
},
'USER_EXPERIENCE': {
'keywords': ['user', 'interface', 'ui', 'ux', 'experience', 'usability', 'navigation', 'button', 'form', 'page'],
'subcategories': {
'UI_ISSUE': ['interface', 'ui', 'button', 'form', 'page', 'layout', 'display', 'rendering'],
'NAVIGATION_ISSUE': ['navigation', 'menu', 'link', 'redirect', 'routing', 'page not found'],
'USABILITY_ISSUE': ['usability', 'experience', 'confusing', 'difficult', 'unclear', 'intuitive'],
'MOBILE_ISSUE': ['mobile', 'app', 'responsive', 'device', 'screen', 'touch', 'gesture'],
}
},
'DATA': {
'keywords': ['data', 'file', 'import', 'export', 'sync', 'backup', 'recovery', 'corruption', 'missing', 'duplicate'],
'subcategories': {
'DATA_CORRUPTION': ['corruption', 'corrupted', 'invalid', 'malformed', 'broken', 'damaged'],
'DATA_LOSS': ['missing', 'lost', 'deleted', 'removed', 'disappeared', 'not found'],
'SYNC_ISSUE': ['sync', 'synchronization', 'conflict', 'merge', 'update', 'latest'],
'BACKUP_ISSUE': ['backup', 'restore', 'recovery', 'archive', 'retention', 'storage'],
}
},
'INTEGRATION': {
'keywords': ['integration', 'third-party', 'external', 'webhook', 'api', 'connection', 'sync', 'import', 'export'],
'subcategories': {
'THIRD_PARTY_ISSUE': ['third-party', 'external', 'vendor', 'partner', 'service', 'provider'],
'WEBHOOK_ISSUE': ['webhook', 'callback', 'notification', 'event', 'trigger', 'delivery'],
'API_INTEGRATION': ['api', 'integration', 'endpoint', 'connection', 'authentication', 'response'],
'DATA_INTEGRATION': ['import', 'export', 'migration', 'transformation', 'mapping', 'format'],
}
}
}
# Urgency indicators
self.urgency_indicators = {
'CRITICAL': ['down', 'outage', 'critical', 'emergency', 'urgent', 'immediate', 'severe', 'complete', 'total'],
'HIGH': ['major', 'significant', 'important', 'priority', 'escalate', 'escalated', 'blocking'],
'MEDIUM': ['moderate', 'some', 'partial', 'intermittent', 'occasional', 'sometimes'],
'LOW': ['minor', 'small', 'cosmetic', 'enhancement', 'improvement', 'suggestion']
}
# Sentiment analysis keywords
self.sentiment_keywords = {
'positive': ['working', 'fixed', 'resolved', 'good', 'excellent', 'improved', 'better', 'success'],
'negative': ['broken', 'failed', 'error', 'issue', 'problem', 'bug', 'crash', 'down', 'slow', 'terrible', 'awful'],
'neutral': ['report', 'incident', 'ticket', 'request', 'update', 'status', 'information']
}
def classify_incident(self, title: str, description: str, free_text: str = "") -> ClassificationResult:
"""
Classify an incident based on its text content
"""
start_time = time.time()
# Combine all text for analysis
combined_text = f"{title} {description} {free_text}".lower()
# Extract keywords
keywords = self._extract_keywords(combined_text)
# Analyze sentiment
sentiment_score = self._analyze_sentiment(combined_text)
# Detect urgency indicators
urgency_indicators = self._detect_urgency_indicators(combined_text)
# Classify category and subcategory
category, subcategory, confidence, alternatives = self._classify_category(combined_text, keywords)
processing_time = time.time() - start_time
return ClassificationResult(
category=category,
subcategory=subcategory,
confidence=confidence,
alternative_categories=alternatives,
keywords=keywords,
sentiment_score=sentiment_score,
urgency_indicators=urgency_indicators
)
def _extract_keywords(self, text: str) -> List[str]:
"""Extract relevant keywords from text"""
# Simple keyword extraction - in production, use more sophisticated NLP
words = re.findall(r'\b[a-zA-Z]{3,}\b', text)
# Filter out common stop words
stop_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', 'a', 'an'}
keywords = [word for word in words if word not in stop_words]
# Count frequency and return top keywords
from collections import Counter
keyword_counts = Counter(keywords)
return [word for word, count in keyword_counts.most_common(10)]
def _analyze_sentiment(self, text: str) -> float:
"""Analyze sentiment of the text (-1 to 1)"""
positive_count = sum(1 for word in self.sentiment_keywords['positive'] if word in text)
negative_count = sum(1 for word in self.sentiment_keywords['negative'] if word in text)
total_sentiment_words = positive_count + negative_count
if total_sentiment_words == 0:
return 0.0
return (positive_count - negative_count) / total_sentiment_words
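# Worked example (hypothetical text): for a report containing "error", "crash"
# and "resolved", positive_count = 1 and negative_count = 2, so the method
# returns (1 - 2) / 3 ≈ -0.33, i.e. mildly negative sentiment.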
def _detect_urgency_indicators(self, text: str) -> List[str]:
"""Detect urgency indicators in the text"""
detected_indicators = []
for urgency_level, indicators in self.urgency_indicators.items():
for indicator in indicators:
if indicator in text:
detected_indicators.append(f"{urgency_level}: {indicator}")
return detected_indicators
def _classify_category(self, text: str, keywords: List[str]) -> Tuple[str, str, float, List[Dict[str, float]]]:
"""Classify the incident category and subcategory"""
category_scores = {}
subcategory_scores = {}
# Score each category based on keyword matches
for category, data in self.categories.items():
score = 0
category_keywords = data['keywords']
# Count keyword matches
for keyword in category_keywords:
if keyword in text:
score += 1
# Also check for partial matches in keywords list
for extracted_keyword in keywords:
if keyword in extracted_keyword or extracted_keyword in keyword:
score += 0.5
category_scores[category] = score
# Score subcategories
for subcategory, subcategory_keywords in data['subcategories'].items():
subcategory_score = 0
for keyword in subcategory_keywords:
if keyword in text:
subcategory_score += 1
for extracted_keyword in keywords:
if keyword in extracted_keyword or extracted_keyword in keyword:
subcategory_score += 0.5
subcategory_scores[subcategory] = subcategory_score
# Find best category
if not category_scores or max(category_scores.values()) == 0:
best_category = 'GENERAL'
best_subcategory = 'UNKNOWN'
confidence = 0.1
else:
best_category = max(category_scores, key=category_scores.get)
max_score = max(category_scores.values())
confidence = min(max_score / 10.0, 1.0) # Normalize to 0-1
# Find best subcategory within the category
if best_category in self.categories:
category_subcategories = self.categories[best_category]['subcategories']
subcategory_scores_filtered = {k: v for k, v in subcategory_scores.items() if k in category_subcategories}
if subcategory_scores_filtered and max(subcategory_scores_filtered.values()) > 0:
best_subcategory = max(subcategory_scores_filtered, key=subcategory_scores_filtered.get)
else:
best_subcategory = 'GENERAL'
else:
best_subcategory = 'GENERAL'
# Create alternative categories
alternatives = []
sorted_categories = sorted(category_scores.items(), key=lambda x: x[1], reverse=True)
for category, score in sorted_categories[:3]:
if category != best_category and score > 0:
alternatives.append({
'category': category,
'confidence': min(score / 10.0, 1.0)
})
return best_category, best_subcategory, confidence, alternatives
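A minimal usage sketch of the classifier above; the incident text is hypothetical and chosen only to exercise the keyword tables:

classifier = IncidentClassifier()
result = classifier.classify_incident(
    title="Database connection timeouts on checkout",
    description="Intermittent query timeouts and deadlocks on the orders database",
)
# With the keyword tables above, this text should score highest for
# INFRASTRUCTURE / DATABASE_ISSUE; confidence is the normalized keyword
# match count, capped at 1.0.
print(result.category, result.subcategory, round(result.confidence, 2))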
class SeverityAnalyzer:
"""
AI-driven severity analyzer based on impact assessment
"""
def __init__(self):
self.model_version = "v1.0"
# Severity indicators
self.severity_indicators = {
'EMERGENCY': {
'keywords': ['down', 'outage', 'critical', 'emergency', 'complete', 'total', 'all', 'entire', 'system'],
'impact_multiplier': 2.0,
'user_impact_threshold': 0.8,
'business_impact_threshold': 0.9
},
'CRITICAL': {
'keywords': ['major', 'significant', 'severe', 'blocking', 'cannot', 'unable', 'failed', 'broken'],
'impact_multiplier': 1.5,
'user_impact_threshold': 0.6,
'business_impact_threshold': 0.7
},
'HIGH': {
'keywords': ['important', 'priority', 'escalate', 'escalated', 'urgent', 'immediate', 'soon'],
'impact_multiplier': 1.2,
'user_impact_threshold': 0.4,
'business_impact_threshold': 0.5
},
'MEDIUM': {
'keywords': ['moderate', 'some', 'partial', 'intermittent', 'occasional', 'sometimes', 'minor'],
'impact_multiplier': 1.0,
'user_impact_threshold': 0.2,
'business_impact_threshold': 0.3
},
'LOW': {
'keywords': ['small', 'cosmetic', 'enhancement', 'improvement', 'suggestion', 'nice to have'],
'impact_multiplier': 0.5,
'user_impact_threshold': 0.1,
'business_impact_threshold': 0.1
}
}
def analyze_severity(self, incident_data: Dict) -> Dict:
"""
Analyze incident severity based on various factors
"""
start_time = time.time()
title = incident_data.get('title', '').lower()
description = incident_data.get('description', '').lower()
free_text = incident_data.get('free_text', '').lower()
affected_users = incident_data.get('affected_users', 0)
business_impact = incident_data.get('business_impact', '').lower()
combined_text = f"{title} {description} {free_text} {business_impact}"
# Calculate impact scores
user_impact_score = self._calculate_user_impact(affected_users, combined_text)
business_impact_score = self._calculate_business_impact(business_impact, combined_text)
technical_impact_score = self._calculate_technical_impact(combined_text)
# Determine severity based on impact scores and keywords
suggested_severity, confidence, reasoning, impact_factors = self._determine_severity(
combined_text, user_impact_score, business_impact_score, technical_impact_score
)
processing_time = time.time() - start_time
return {
'suggested_severity': suggested_severity,
'confidence_score': confidence,
'user_impact_score': user_impact_score,
'business_impact_score': business_impact_score,
'technical_impact_score': technical_impact_score,
'reasoning': reasoning,
'impact_factors': impact_factors,
'processing_time': processing_time
}
def _calculate_user_impact(self, affected_users: int, text: str) -> float:
"""Calculate user impact score (0-1)"""
# Base score from affected users count
if affected_users == 0:
# Try to extract from text
user_indicators = ['all users', 'everyone', 'entire user base', 'all customers']
if any(indicator in text for indicator in user_indicators):
base_score = 0.9
else:
base_score = 0.1
elif affected_users < 10:
base_score = 0.2
elif affected_users < 100:
base_score = 0.4
elif affected_users < 1000:
base_score = 0.6
elif affected_users < 10000:
base_score = 0.8
else:
base_score = 1.0
# Adjust based on text indicators
if 'all' in text or 'everyone' in text:
base_score = min(base_score + 0.2, 1.0)
elif 'some' in text or 'few' in text:
base_score = max(base_score - 0.1, 0.0)
return base_score
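# Worked example (hypothetical values): affected_users = 500 gives a base
# score of 0.6; if the text also mentions "all", the score is bumped to 0.8.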
def _calculate_business_impact(self, business_impact: str, text: str) -> float:
"""Calculate business impact score (0-1)"""
if not business_impact:
# Try to infer from text
high_impact_indicators = ['revenue', 'sales', 'customer', 'business', 'critical', 'essential', 'production']
if any(indicator in text for indicator in high_impact_indicators):
return 0.6
return 0.3
# Analyze business impact text
high_impact_keywords = ['revenue', 'sales', 'customer', 'business', 'critical', 'essential', 'production', 'outage', 'down']
medium_impact_keywords = ['service', 'feature', 'functionality', 'performance', 'slow']
low_impact_keywords = ['cosmetic', 'minor', 'enhancement', 'improvement']
score = 0.3 # Base score
for keyword in high_impact_keywords:
if keyword in business_impact:
score += 0.1
for keyword in medium_impact_keywords:
if keyword in business_impact:
score += 0.05
for keyword in low_impact_keywords:
if keyword in business_impact:
score -= 0.05
return min(max(score, 0.0), 1.0)
def _calculate_technical_impact(self, text: str) -> float:
"""Calculate technical impact score (0-1)"""
technical_indicators = {
'high': ['down', 'outage', 'crash', 'failed', 'broken', 'unavailable', 'error', 'exception'],
'medium': ['slow', 'performance', 'latency', 'timeout', 'intermittent', 'partial'],
'low': ['cosmetic', 'display', 'ui', 'minor', 'enhancement']
}
score = 0.3 # Base score
for level, keywords in technical_indicators.items():
for keyword in keywords:
if keyword in text:
if level == 'high':
score += 0.15
elif level == 'medium':
score += 0.08
elif level == 'low':
score -= 0.05
return min(max(score, 0.0), 1.0)
def _determine_severity(self, text: str, user_impact: float, business_impact: float, technical_impact: float) -> Tuple[str, float, str, List[str]]:
"""Determine severity based on impact scores and text analysis"""
impact_factors = []
# Calculate weighted impact score
weighted_score = (user_impact * 0.4 + business_impact * 0.4 + technical_impact * 0.2)
# Check for severity indicators in text
severity_scores = {}
for severity, data in self.severity_indicators.items():
score = 0
for keyword in data['keywords']:
if keyword in text:
score += 1
# Apply impact multiplier
score *= data['impact_multiplier']
severity_scores[severity] = score
# Find best severity match
if severity_scores and max(severity_scores.values()) > 0:
best_severity = max(severity_scores, key=severity_scores.get)
text_confidence = min(max(severity_scores.values()) / 5.0, 1.0)
else:
# Fallback to impact-based severity
if weighted_score >= 0.8:
best_severity = 'CRITICAL'
elif weighted_score >= 0.6:
best_severity = 'HIGH'
elif weighted_score >= 0.4:
best_severity = 'MEDIUM'
else:
best_severity = 'LOW'
text_confidence = 0.5
# Combine text and impact confidence
confidence = (text_confidence + (1.0 - abs(weighted_score - self._severity_to_score(best_severity)))) / 2.0
# Generate reasoning
reasoning_parts = []
if user_impact > 0.6:
reasoning_parts.append(f"High user impact ({user_impact:.1%})")
impact_factors.append(f"User Impact: {user_impact:.1%}")
if business_impact > 0.6:
reasoning_parts.append(f"Significant business impact ({business_impact:.1%})")
impact_factors.append(f"Business Impact: {business_impact:.1%}")
if technical_impact > 0.6:
reasoning_parts.append(f"Major technical impact ({technical_impact:.1%})")
impact_factors.append(f"Technical Impact: {technical_impact:.1%}")
if severity_scores and max(severity_scores.values()) > 0:
reasoning_parts.append("Severity indicators detected in incident description")
impact_factors.append("Text Analysis: Severity keywords found")
reasoning = "; ".join(reasoning_parts) if reasoning_parts else "Based on overall impact assessment"
return best_severity, confidence, reasoning, impact_factors
def _severity_to_score(self, severity: str) -> float:
"""Convert severity level to numeric score"""
severity_scores = {
'LOW': 0.2,
'MEDIUM': 0.4,
'HIGH': 0.6,
'CRITICAL': 0.8,
'EMERGENCY': 1.0
}
return severity_scores.get(severity, 0.4)
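A minimal sketch of calling the severity analyzer; the payload is hypothetical and supplies only the keys that analyze_severity reads:

analyzer = SeverityAnalyzer()
assessment = analyzer.analyze_severity({
    'title': "Checkout API down",
    'description': "Complete outage, all users unable to pay",
    'affected_users': 25000,
    'business_impact': "Revenue loss during the outage",
})
# analyze_severity returns a plain dict whose keys mirror the return statement
# above; this payload should land at the EMERGENCY end of the scale.
print(assessment['suggested_severity'], round(assessment['confidence_score'], 2))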

View File

@@ -0,0 +1,481 @@
"""
Correlation engine for linking related incidents and problem detection
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from django.utils import timezone
from .classification import IncidentClassifier
@dataclass
class CorrelationResult:
"""Result of incident correlation analysis"""
correlation_type: str
confidence_score: float
correlation_strength: str
shared_keywords: List[str]
time_difference: timedelta
similarity_score: float
is_problem_indicator: bool
problem_description: Optional[str]
class IncidentCorrelationEngine:
"""
AI-driven correlation engine for linking related incidents
"""
def __init__(self):
self.model_version = "v1.0"
self.classifier = IncidentClassifier()
# Correlation thresholds
self.correlation_thresholds = {
'VERY_STRONG': 0.9,
'STRONG': 0.7,
'MODERATE': 0.5,
'WEAK': 0.3
}
# Problem detection patterns
self.problem_patterns = {
'CASCADE_FAILURE': {
'keywords': ['cascade', 'chain', 'reaction', 'domino', 'ripple', 'effect'],
'time_window': timedelta(hours=2),
'min_incidents': 3
},
'RECURRING_ISSUE': {
'keywords': ['same', 'again', 'recurring', 'repeated', 'similar', 'identical'],
'time_window': timedelta(days=7),
'min_incidents': 2
},
'SERVICE_DEPENDENCY': {
'keywords': ['dependency', 'dependent', 'downstream', 'upstream', 'service', 'api'],
'time_window': timedelta(hours=1),
'min_incidents': 2
},
'INFRASTRUCTURE_PATTERN': {
'keywords': ['server', 'database', 'network', 'storage', 'infrastructure'],
'time_window': timedelta(hours=4),
'min_incidents': 3
}
}
def correlate_incidents(self, incident_a: Dict, incident_b: Dict) -> Optional[CorrelationResult]:
"""
Correlate two incidents and determine if they are related
"""
# Calculate various similarity metrics
text_similarity = self._calculate_text_similarity(incident_a, incident_b)
temporal_similarity = self._calculate_temporal_similarity(incident_a, incident_b)
service_similarity = self._calculate_service_similarity(incident_a, incident_b)
category_similarity = self._calculate_category_similarity(incident_a, incident_b)
# Calculate overall similarity score
overall_similarity = (
text_similarity * 0.4 +
temporal_similarity * 0.2 +
service_similarity * 0.2 +
category_similarity * 0.2
)
# Determine if incidents are correlated
if overall_similarity < 0.3:
return None
# Determine correlation type
correlation_type = self._determine_correlation_type(
incident_a, incident_b, text_similarity, temporal_similarity, service_similarity
)
# Calculate confidence score
confidence_score = self._calculate_confidence_score(
overall_similarity, correlation_type, incident_a, incident_b
)
# Determine correlation strength
correlation_strength = self._determine_correlation_strength(confidence_score)
# Extract shared keywords
shared_keywords = self._extract_shared_keywords(incident_a, incident_b)
# Calculate time difference
time_diff = self._calculate_time_difference(incident_a, incident_b)
# Check for problem indicators
is_problem_indicator, problem_description = self._detect_problem_patterns(
incident_a, incident_b, correlation_type, confidence_score
)
return CorrelationResult(
correlation_type=correlation_type,
confidence_score=confidence_score,
correlation_strength=correlation_strength,
shared_keywords=shared_keywords,
time_difference=time_diff,
similarity_score=overall_similarity,
is_problem_indicator=is_problem_indicator,
problem_description=problem_description
)
def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate text similarity between two incidents"""
# Combine text fields
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()
# Extract keywords
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
if not keywords_a or not keywords_b:
return 0.0
# Calculate Jaccard similarity
intersection = len(keywords_a.intersection(keywords_b))
union = len(keywords_a.union(keywords_b))
jaccard_similarity = intersection / union if union > 0 else 0.0
# Also check for exact phrase matches
phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)
# Combine similarities
return (jaccard_similarity * 0.7 + phrase_similarity * 0.3)
def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate similarity based on common phrases"""
# Extract 2-3 word phrases
phrases_a = set()
phrases_b = set()
words_a = text_a.split()
words_b = text_b.split()
# Extract 2-word phrases
for i in range(len(words_a) - 1):
phrases_a.add(f"{words_a[i]} {words_a[i+1]}")
for i in range(len(words_b) - 1):
phrases_b.add(f"{words_b[i]} {words_b[i+1]}")
# Extract 3-word phrases
for i in range(len(words_a) - 2):
phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")
for i in range(len(words_b) - 2):
phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")
if not phrases_a or not phrases_b:
return 0.0
intersection = len(phrases_a.intersection(phrases_b))
union = len(phrases_a.union(phrases_b))
return intersection / union if union > 0 else 0.0
def _calculate_temporal_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate temporal similarity between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return 0.0
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
time_diff = abs((created_a - created_b).total_seconds())
# Calculate similarity based on time difference
# Incidents within 1 hour: high similarity
# Incidents within 24 hours: medium similarity
# Incidents within 7 days: low similarity
if time_diff <= 3600: # 1 hour
return 1.0
elif time_diff <= 86400: # 24 hours
return 0.7
elif time_diff <= 604800: # 7 days
return 0.3
else:
return 0.0
def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate service/component similarity"""
# Extract service/component information from text
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
# Common service/component keywords
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
if not services_a or not services_b:
return 0.0
intersection = len(services_a.intersection(services_b))
union = len(services_a.union(services_b))
return intersection / union if union > 0 else 0.0
def _calculate_category_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate category similarity"""
category_a = incident_a.get('category', '')
category_b = incident_b.get('category', '')
if not category_a or not category_b:
return 0.0
if category_a == category_b:
return 1.0
# Check for related categories
related_categories = {
'INFRASTRUCTURE': ['APPLICATION', 'SECURITY'],
'APPLICATION': ['INFRASTRUCTURE', 'USER_EXPERIENCE'],
'SECURITY': ['INFRASTRUCTURE', 'APPLICATION'],
'USER_EXPERIENCE': ['APPLICATION', 'DATA'],
'DATA': ['USER_EXPERIENCE', 'INTEGRATION'],
'INTEGRATION': ['DATA', 'APPLICATION']
}
if category_b in related_categories.get(category_a, []):
return 0.5
return 0.0
def _determine_correlation_type(self, incident_a: Dict, incident_b: Dict,
text_similarity: float, temporal_similarity: float,
service_similarity: float) -> str:
"""Determine the type of correlation between incidents"""
# Same service correlation
if service_similarity > 0.7:
return 'SAME_SERVICE'
# Same component correlation
if text_similarity > 0.6 and service_similarity > 0.4:
return 'SAME_COMPONENT'
# Temporal correlation
if temporal_similarity > 0.7 and text_similarity > 0.3:
return 'TEMPORAL'
# Pattern match
if text_similarity > 0.5:
return 'PATTERN'
# Dependency correlation
if service_similarity > 0.4 and temporal_similarity > 0.5:
return 'DEPENDENCY'
# Cascade effect
if temporal_similarity > 0.8 and text_similarity > 0.4:
return 'CASCADE'
return 'PATTERN' # Default
def _calculate_confidence_score(self, overall_similarity: float, correlation_type: str,
incident_a: Dict, incident_b: Dict) -> float:
"""Calculate confidence score for the correlation"""
base_confidence = overall_similarity
# Adjust based on correlation type
type_adjustments = {
'SAME_SERVICE': 0.1,
'SAME_COMPONENT': 0.15,
'TEMPORAL': 0.05,
'PATTERN': 0.0,
'DEPENDENCY': 0.1,
'CASCADE': 0.2
}
base_confidence += type_adjustments.get(correlation_type, 0.0)
# Adjust based on incident characteristics
if incident_a.get('severity') == incident_b.get('severity'):
base_confidence += 0.05
if incident_a.get('status') == incident_b.get('status'):
base_confidence += 0.03
return min(base_confidence, 1.0)
def _determine_correlation_strength(self, confidence_score: float) -> str:
"""Determine correlation strength based on confidence score"""
if confidence_score >= self.correlation_thresholds['VERY_STRONG']:
return 'VERY_STRONG'
elif confidence_score >= self.correlation_thresholds['STRONG']:
return 'STRONG'
elif confidence_score >= self.correlation_thresholds['MODERATE']:
return 'MODERATE'
else:
return 'WEAK'
def _extract_shared_keywords(self, incident_a: Dict, incident_b: Dict) -> List[str]:
"""Extract keywords shared between incidents"""
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
shared = list(keywords_a.intersection(keywords_b))
return shared[:10] # Return top 10 shared keywords
def _calculate_time_difference(self, incident_a: Dict, incident_b: Dict) -> timedelta:
"""Calculate time difference between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return timedelta(0)
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
return abs(created_a - created_b)
def _detect_problem_patterns(self, incident_a: Dict, incident_b: Dict,
correlation_type: str, confidence_score: float) -> Tuple[bool, Optional[str]]:
"""Detect if correlation indicates a larger problem"""
# High confidence correlations are more likely to indicate problems
if confidence_score < 0.6:
return False, None
# Check for specific problem patterns
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
combined_text = f"{text_a} {text_b}"
for pattern_name, pattern_data in self.problem_patterns.items():
# Check for pattern keywords
keyword_matches = sum(1 for keyword in pattern_data['keywords'] if keyword in combined_text)
if keyword_matches >= 2: # At least 2 keywords match
return True, f"Potential {pattern_name.replace('_', ' ').lower()} detected"
# Check for cascade effects
if correlation_type == 'CASCADE' and confidence_score > 0.7:
return True, "Potential cascade failure detected"
# Check for recurring issues
if correlation_type == 'SAME_SERVICE' and confidence_score > 0.8:
return True, "Potential recurring service issue detected"
return False, None
def find_related_incidents(self, target_incident: Dict, all_incidents: List[Dict],
limit: int = 10) -> List[Tuple[Dict, CorrelationResult]]:
"""Find incidents related to a target incident"""
correlations = []
for incident in all_incidents:
if incident['id'] == target_incident['id']:
continue
correlation = self.correlate_incidents(target_incident, incident)
if correlation:
correlations.append((incident, correlation))
# Sort by confidence score and return top results
correlations.sort(key=lambda x: x[1].confidence_score, reverse=True)
return correlations[:limit]
def detect_problem_clusters(self, incidents: List[Dict],
min_incidents: int = 3,
time_window: timedelta = timedelta(hours=24)) -> List[Dict]:
"""Detect clusters of related incidents that might indicate larger problems"""
clusters = []
processed_incidents = set()
for incident in incidents:
if incident['id'] in processed_incidents:
continue
# Find related incidents within time window
related_incidents = []
incident_time = incident.get('created_at')
if isinstance(incident_time, str):
incident_time = datetime.fromisoformat(incident_time.replace('Z', '+00:00'))
for other_incident in incidents:
if other_incident['id'] == incident['id'] or other_incident['id'] in processed_incidents:
continue
other_time = other_incident.get('created_at')
if isinstance(other_time, str):
other_time = datetime.fromisoformat(other_time.replace('Z', '+00:00'))
# Check if within time window
if abs((incident_time - other_time).total_seconds()) <= time_window.total_seconds():
correlation = self.correlate_incidents(incident, other_incident)
if correlation and correlation.confidence_score > 0.5:
related_incidents.append((other_incident, correlation))
# If we found enough related incidents, create a cluster
if len(related_incidents) >= min_incidents - 1: # -1 because we include the original incident
cluster = {
'incidents': [incident] + [inc[0] for inc in related_incidents],
'correlations': [inc[1] for inc in related_incidents],
'problem_type': self._classify_problem_type(incident, related_incidents),
'confidence': sum(inc[1].confidence_score for inc in related_incidents) / len(related_incidents),
'time_span': self._calculate_cluster_time_span([incident] + [inc[0] for inc in related_incidents])
}
clusters.append(cluster)
# Mark incidents as processed
processed_incidents.add(incident['id'])
for related_incident, _ in related_incidents:
processed_incidents.add(related_incident['id'])
return clusters
def _classify_problem_type(self, incident: Dict, related_incidents: List[Tuple[Dict, CorrelationResult]]) -> str:
"""Classify the type of problem based on incident cluster"""
correlation_types = [corr.correlation_type for _, corr in related_incidents]
if 'CASCADE' in correlation_types:
return 'CASCADE_FAILURE'
elif 'SAME_SERVICE' in correlation_types:
return 'SERVICE_OUTAGE'
elif 'TEMPORAL' in correlation_types:
return 'RECURRING_ISSUE'
else:
return 'PATTERN_BASED_PROBLEM'
def _calculate_cluster_time_span(self, incidents: List[Dict]) -> timedelta:
"""Calculate the time span of a cluster of incidents"""
times = []
for incident in incidents:
created_at = incident.get('created_at')
if isinstance(created_at, str):
created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
times.append(created_at)
if len(times) < 2:
return timedelta(0)
return max(times) - min(times)
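A minimal usage sketch of the correlation engine; both incident dicts are hypothetical and carry only the keys the engine reads (id, title, description, and an ISO-8601 created_at):

engine = IncidentCorrelationEngine()
incident_a = {
    'id': 1,
    'title': "Payment API timeouts",
    'description': "Gateway returning 504 errors, database connection pool exhausted",
    'created_at': "2025-09-19T08:00:00+00:00",
}
incident_b = {
    'id': 2,
    'title': "Database connection errors in payment service",
    'description': "Connection timeouts from the payment API to the orders database",
    'created_at': "2025-09-19T08:20:00+00:00",
}
correlation = engine.correlate_incidents(incident_a, incident_b)
if correlation:  # correlate_incidents returns None below the 0.3 similarity floor
    print(correlation.correlation_type, correlation.correlation_strength,
          round(correlation.confidence_score, 2), correlation.shared_keywords)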

View File

@@ -0,0 +1,516 @@
"""
Duplication detection engine for identifying and merging duplicate incidents
"""
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime, timedelta
from .classification import IncidentClassifier
@dataclass
class DuplicationResult:
"""Result of duplication detection analysis"""
duplication_type: str
similarity_score: float
confidence_score: float
text_similarity: float
temporal_proximity: float
service_similarity: float
recommended_action: str
merge_confidence: float
reasoning: str
shared_elements: List[str]
class DuplicationDetector:
"""
AI-driven duplication detector for identifying duplicate incidents
"""
def __init__(self):
self.model_version = "v1.0"
self.classifier = IncidentClassifier()
# Duplication thresholds
self.duplication_thresholds = {
'EXACT': 0.95,
'NEAR_DUPLICATE': 0.85,
'SIMILAR': 0.70,
'POTENTIAL_DUPLICATE': 0.50
}
# Action thresholds
self.action_thresholds = {
'MERGE': 0.90,
'LINK': 0.75,
'REVIEW': 0.60,
'NO_ACTION': 0.0
}
# Time windows for temporal proximity
self.time_windows = {
'EXACT': timedelta(minutes=30),
'NEAR_DUPLICATE': timedelta(hours=2),
'SIMILAR': timedelta(hours=24),
'POTENTIAL_DUPLICATE': timedelta(days=7)
}
def detect_duplication(self, incident_a: Dict, incident_b: Dict) -> Optional[DuplicationResult]:
"""
Detect if two incidents are duplicates
"""
# Calculate various similarity metrics
text_similarity = self._calculate_text_similarity(incident_a, incident_b)
temporal_proximity = self._calculate_temporal_proximity(incident_a, incident_b)
service_similarity = self._calculate_service_similarity(incident_a, incident_b)
metadata_similarity = self._calculate_metadata_similarity(incident_a, incident_b)
# Calculate overall similarity score
overall_similarity = (
text_similarity * 0.5 +
temporal_proximity * 0.2 +
service_similarity * 0.2 +
metadata_similarity * 0.1
)
# Determine duplication type
duplication_type = self._determine_duplication_type(overall_similarity, text_similarity, temporal_proximity)
if duplication_type == 'NO_DUPLICATE':
return None
# Calculate confidence score
confidence_score = self._calculate_confidence_score(
overall_similarity, text_similarity, temporal_proximity, service_similarity
)
# Determine recommended action
recommended_action = self._determine_recommended_action(confidence_score, duplication_type)
# Calculate merge confidence
merge_confidence = self._calculate_merge_confidence(
confidence_score, duplication_type, incident_a, incident_b
)
# Generate reasoning
reasoning = self._generate_reasoning(
duplication_type, text_similarity, temporal_proximity, service_similarity
)
# Extract shared elements
shared_elements = self._extract_shared_elements(incident_a, incident_b)
return DuplicationResult(
duplication_type=duplication_type,
similarity_score=overall_similarity,
confidence_score=confidence_score,
text_similarity=text_similarity,
temporal_proximity=temporal_proximity,
service_similarity=service_similarity,
recommended_action=recommended_action,
merge_confidence=merge_confidence,
reasoning=reasoning,
shared_elements=shared_elements
)
def _calculate_text_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate text similarity between incidents"""
# Combine all text fields
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')} {incident_a.get('free_text', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')} {incident_b.get('free_text', '')}".lower()
# Calculate multiple similarity metrics
jaccard_similarity = self._calculate_jaccard_similarity(text_a, text_b)
cosine_similarity = self._calculate_cosine_similarity(text_a, text_b)
phrase_similarity = self._calculate_phrase_similarity(text_a, text_b)
semantic_similarity = self._calculate_semantic_similarity(text_a, text_b)
# Weighted combination
return (
jaccard_similarity * 0.3 +
cosine_similarity * 0.3 +
phrase_similarity * 0.2 +
semantic_similarity * 0.2
)
def _calculate_jaccard_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate Jaccard similarity based on word sets"""
words_a = set(text_a.split())
words_b = set(text_b.split())
if not words_a or not words_b:
return 0.0
intersection = len(words_a.intersection(words_b))
union = len(words_a.union(words_b))
return intersection / union if union > 0 else 0.0
def _calculate_cosine_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate cosine similarity based on word frequency"""
from collections import Counter
words_a = Counter(text_a.split())
words_b = Counter(text_b.split())
# Get all unique words
all_words = set(words_a.keys()) | set(words_b.keys())
if not all_words:
return 0.0
# Create vectors
vector_a = [words_a.get(word, 0) for word in all_words]
vector_b = [words_b.get(word, 0) for word in all_words]
# Calculate cosine similarity
dot_product = sum(a * b for a, b in zip(vector_a, vector_b))
magnitude_a = sum(a * a for a in vector_a) ** 0.5
magnitude_b = sum(b * b for b in vector_b) ** 0.5
if magnitude_a == 0 or magnitude_b == 0:
return 0.0
return dot_product / (magnitude_a * magnitude_b)
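# Worked example (hypothetical texts): for text_a = "database timeout error"
# and text_b = "database timeout again", the shared words give a dot product
# of 2 with both magnitudes sqrt(3), so the cosine similarity is 2 / 3 ≈ 0.67.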
def _calculate_phrase_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate similarity based on common phrases"""
# Extract 2-3 word phrases
phrases_a = set()
phrases_b = set()
words_a = text_a.split()
words_b = text_b.split()
# Extract 2-word phrases
for i in range(len(words_a) - 1):
phrases_a.add(f"{words_a[i]} {words_a[i+1]}")
for i in range(len(words_b) - 1):
phrases_b.add(f"{words_b[i]} {words_b[i+1]}")
# Extract 3-word phrases
for i in range(len(words_a) - 2):
phrases_a.add(f"{words_a[i]} {words_a[i+1]} {words_a[i+2]}")
for i in range(len(words_b) - 2):
phrases_b.add(f"{words_b[i]} {words_b[i+1]} {words_b[i+2]}")
if not phrases_a or not phrases_b:
return 0.0
intersection = len(phrases_a.intersection(phrases_b))
union = len(phrases_a.union(phrases_b))
return intersection / union if union > 0 else 0.0
def _calculate_semantic_similarity(self, text_a: str, text_b: str) -> float:
"""Calculate semantic similarity using keyword analysis"""
# Extract keywords using the classifier
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
if not keywords_a or not keywords_b:
return 0.0
# Calculate semantic similarity based on keyword overlap
intersection = len(keywords_a.intersection(keywords_b))
union = len(keywords_a.union(keywords_b))
base_similarity = intersection / union if union > 0 else 0.0
# Boost similarity for technical terms
technical_terms = {
'error', 'exception', 'timeout', 'connection', 'database', 'server',
'api', 'service', 'application', 'network', 'storage', 'memory',
'cpu', 'disk', 'bandwidth', 'latency', 'performance', 'crash'
}
technical_intersection = len(keywords_a.intersection(keywords_b).intersection(technical_terms))
if technical_intersection > 0:
base_similarity += 0.1 * technical_intersection
return min(base_similarity, 1.0)
def _calculate_temporal_proximity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate temporal proximity between incidents"""
created_a = incident_a.get('created_at')
created_b = incident_b.get('created_at')
if not created_a or not created_b:
return 0.0
# Convert to datetime if needed
if isinstance(created_a, str):
created_a = datetime.fromisoformat(created_a.replace('Z', '+00:00'))
if isinstance(created_b, str):
created_b = datetime.fromisoformat(created_b.replace('Z', '+00:00'))
time_diff = abs((created_a - created_b).total_seconds())
# Calculate proximity score based on time difference
if time_diff <= 300: # 5 minutes
return 1.0
elif time_diff <= 1800: # 30 minutes
return 0.9
elif time_diff <= 3600: # 1 hour
return 0.7
elif time_diff <= 7200: # 2 hours
return 0.5
elif time_diff <= 86400: # 24 hours
return 0.3
elif time_diff <= 604800: # 7 days
return 0.1
else:
return 0.0
def _calculate_service_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate service/component similarity"""
# Extract service information from text
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
# Service/component keywords
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile',
'frontend', 'backend', 'microservice', 'gateway', 'load balancer', 'cache',
'queue', 'message', 'notification', 'email', 'sms', 'payment', 'auth',
'user service', 'order service', 'payment service', 'notification service'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
if not services_a or not services_b:
return 0.0
intersection = len(services_a.intersection(services_b))
union = len(services_a.union(services_b))
return intersection / union if union > 0 else 0.0
def _calculate_metadata_similarity(self, incident_a: Dict, incident_b: Dict) -> float:
"""Calculate similarity based on metadata fields"""
similarity_score = 0.0
total_fields = 0
# Compare severity
if incident_a.get('severity') == incident_b.get('severity'):
similarity_score += 1.0
total_fields += 1
# Compare status
if incident_a.get('status') == incident_b.get('status'):
similarity_score += 1.0
total_fields += 1
# Compare category
if incident_a.get('category') == incident_b.get('category'):
similarity_score += 1.0
total_fields += 1
# Compare assigned user
if incident_a.get('assigned_to') == incident_b.get('assigned_to'):
similarity_score += 1.0
total_fields += 1
# Compare reporter
if incident_a.get('reporter') == incident_b.get('reporter'):
similarity_score += 1.0
total_fields += 1
return similarity_score / total_fields if total_fields > 0 else 0.0
def _determine_duplication_type(self, overall_similarity: float, text_similarity: float,
temporal_proximity: float) -> str:
"""Determine the type of duplication"""
if overall_similarity >= self.duplication_thresholds['EXACT']:
return 'EXACT'
elif overall_similarity >= self.duplication_thresholds['NEAR_DUPLICATE']:
return 'NEAR_DUPLICATE'
elif overall_similarity >= self.duplication_thresholds['SIMILAR']:
return 'SIMILAR'
elif overall_similarity >= self.duplication_thresholds['POTENTIAL_DUPLICATE']:
return 'POTENTIAL_DUPLICATE'
else:
return 'NO_DUPLICATE'
def _calculate_confidence_score(self, overall_similarity: float, text_similarity: float,
temporal_proximity: float, service_similarity: float) -> float:
"""Calculate confidence score for duplication detection"""
base_confidence = overall_similarity
# Boost confidence for high text similarity
if text_similarity > 0.8:
base_confidence += 0.1
# Boost confidence for high temporal proximity
if temporal_proximity > 0.8:
base_confidence += 0.1
# Boost confidence for high service similarity
if service_similarity > 0.8:
base_confidence += 0.05
return min(base_confidence, 1.0)
def _determine_recommended_action(self, confidence_score: float, duplication_type: str) -> str:
"""Determine recommended action based on confidence and duplication type"""
if confidence_score >= self.action_thresholds['MERGE']:
return 'MERGE'
elif confidence_score >= self.action_thresholds['LINK']:
return 'LINK'
elif confidence_score >= self.action_thresholds['REVIEW']:
return 'REVIEW'
else:
return 'NO_ACTION'
def _calculate_merge_confidence(self, confidence_score: float, duplication_type: str,
incident_a: Dict, incident_b: Dict) -> float:
"""Calculate confidence for merging incidents"""
merge_confidence = confidence_score
# Adjust based on duplication type
type_adjustments = {
'EXACT': 0.1,
'NEAR_DUPLICATE': 0.05,
'SIMILAR': 0.0,
'POTENTIAL_DUPLICATE': -0.1
}
merge_confidence += type_adjustments.get(duplication_type, 0.0)
# Adjust based on incident status
if incident_a.get('status') == incident_b.get('status'):
merge_confidence += 0.05
# Adjust based on severity
if incident_a.get('severity') == incident_b.get('severity'):
merge_confidence += 0.03
return min(max(merge_confidence, 0.0), 1.0)
def _generate_reasoning(self, duplication_type: str, text_similarity: float,
temporal_proximity: float, service_similarity: float) -> str:
"""Generate human-readable reasoning for duplication detection"""
reasoning_parts = []
if text_similarity > 0.8:
reasoning_parts.append(f"Very high text similarity ({text_similarity:.1%})")
elif text_similarity > 0.6:
reasoning_parts.append(f"High text similarity ({text_similarity:.1%})")
elif text_similarity > 0.4:
reasoning_parts.append(f"Moderate text similarity ({text_similarity:.1%})")
if temporal_proximity > 0.8:
reasoning_parts.append(f"Very close temporal proximity ({temporal_proximity:.1%})")
elif temporal_proximity > 0.6:
reasoning_parts.append(f"Close temporal proximity ({temporal_proximity:.1%})")
if service_similarity > 0.8:
reasoning_parts.append(f"Very high service similarity ({service_similarity:.1%})")
elif service_similarity > 0.6:
reasoning_parts.append(f"High service similarity ({service_similarity:.1%})")
if duplication_type == 'EXACT':
reasoning_parts.append("Incidents appear to be exact duplicates")
elif duplication_type == 'NEAR_DUPLICATE':
reasoning_parts.append("Incidents appear to be near duplicates")
elif duplication_type == 'SIMILAR':
reasoning_parts.append("Incidents appear to be similar")
elif duplication_type == 'POTENTIAL_DUPLICATE':
reasoning_parts.append("Incidents may be duplicates")
return "; ".join(reasoning_parts) if reasoning_parts else "Based on overall similarity analysis"
def _extract_shared_elements(self, incident_a: Dict, incident_b: Dict) -> List[str]:
"""Extract elements shared between incidents"""
shared_elements = []
# Shared keywords
text_a = f"{incident_a.get('title', '')} {incident_a.get('description', '')}".lower()
text_b = f"{incident_b.get('title', '')} {incident_b.get('description', '')}".lower()
keywords_a = set(self.classifier._extract_keywords(text_a))
keywords_b = set(self.classifier._extract_keywords(text_b))
shared_keywords = keywords_a.intersection(keywords_b)
if shared_keywords:
shared_elements.append(f"Keywords: {', '.join(list(shared_keywords)[:5])}")
# Shared services
service_keywords = [
'api', 'service', 'database', 'server', 'application', 'website', 'mobile'
]
services_a = set()
services_b = set()
for keyword in service_keywords:
if keyword in text_a:
services_a.add(keyword)
if keyword in text_b:
services_b.add(keyword)
shared_services = services_a.intersection(services_b)
if shared_services:
shared_elements.append(f"Services: {', '.join(shared_services)}")
# Shared metadata
if incident_a.get('severity') == incident_b.get('severity'):
shared_elements.append(f"Severity: {incident_a.get('severity')}")
if incident_a.get('category') == incident_b.get('category'):
shared_elements.append(f"Category: {incident_a.get('category')}")
if incident_a.get('status') == incident_b.get('status'):
shared_elements.append(f"Status: {incident_a.get('status')}")
return shared_elements
def find_duplicate_candidates(self, target_incident: Dict, all_incidents: List[Dict],
limit: int = 10) -> List[Tuple[Dict, DuplicationResult]]:
"""Find incidents that might be duplicates of the target incident"""
candidates = []
for incident in all_incidents:
if incident['id'] == target_incident['id']:
continue
duplication = self.detect_duplication(target_incident, incident)
if duplication:
candidates.append((incident, duplication))
# Sort by confidence score and return top results
candidates.sort(key=lambda x: x[1].confidence_score, reverse=True)
return candidates[:limit]
def batch_detect_duplicates(self, incidents: List[Dict]) -> List[Tuple[Dict, Dict, DuplicationResult]]:
"""Batch detect duplicates in a list of incidents"""
duplicates = []
processed_pairs = set()
for i, incident_a in enumerate(incidents):
for j, incident_b in enumerate(incidents[i+1:], i+1):
# Create a unique pair identifier
pair_id = tuple(sorted([incident_a['id'], incident_b['id']]))
if pair_id in processed_pairs:
continue
processed_pairs.add(pair_id)
duplication = self.detect_duplication(incident_a, incident_b)
if duplication:
duplicates.append((incident_a, incident_b, duplication))
# Sort by confidence score
duplicates.sort(key=lambda x: x[2].confidence_score, reverse=True)
return duplicates
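A minimal sketch of the duplication detector on two hypothetical near-duplicate reports; the reported type and action depend on the thresholds defined above:

detector = DuplicationDetector()
reports = [
    {'id': 101, 'title': "Login page down",
     'description': "Users cannot log in, auth service returning errors",
     'created_at': "2025-09-19T09:00:00+00:00", 'severity': 'HIGH', 'status': 'OPEN'},
    {'id': 102, 'title': "Login broken",
     'description': "Auth service errors, users cannot log in",
     'created_at': "2025-09-19T09:05:00+00:00", 'severity': 'HIGH', 'status': 'OPEN'},
]
for incident_a, incident_b, dup in detector.batch_detect_duplicates(reports):
    print(incident_a['id'], incident_b['id'], dup.duplication_type,
          dup.recommended_action, round(dup.confidence_score, 2))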