Files
ETB/ETB-API/incident_intelligence/management/commands/setup_incident_intelligence.py
Iliyan Angelov 6b247e5b9f Updates
2025-09-19 11:58:53 +03:00

214 lines
9.3 KiB
Python

"""
Management command to set up incident intelligence module
"""
from django.core.management.base import BaseCommand
from django.db import transaction
from django.contrib.auth import get_user_model
from incident_intelligence.models import Incident, IncidentPattern
User = get_user_model()
class Command(BaseCommand):
    """Management command that optionally seeds sample data and queues AI analysis.

    Each step is opt-in through its own flag:

    * ``--create-sample-data`` inserts sample ``Incident`` rows.
    * ``--create-patterns`` inserts sample ``IncidentPattern`` rows.
    * ``--run-ai-analysis`` hands every incident with ``ai_processed=False``
      to ``batch_process_incidents_ai``.

    With no flags the command only prints its start and completion banners.
    """

    help = 'Set up the incident intelligence module with sample data and configurations'

    def add_arguments(self, parser):
        """Register the three opt-in boolean flags on ``parser``."""
        parser.add_argument(
            '--create-sample-data',
            action='store_true',
            help='Create sample incidents for testing',
        )
        parser.add_argument(
            '--create-patterns',
            action='store_true',
            help='Create sample patterns',
        )
        parser.add_argument(
            '--run-ai-analysis',
            action='store_true',
            help='Run AI analysis on existing incidents',
        )

    def handle(self, *args, **options):
        """Run the requested setup steps in a fixed order.

        Sample incidents are created before AI analysis is queued, so incidents
        seeded in the same invocation are eligible for analysis.
        """
        self.stdout.write(
            self.style.SUCCESS('Setting up Incident Intelligence module...')
        )

        if options['create_sample_data']:
            self.create_sample_data()
        if options['create_patterns']:
            self.create_sample_patterns()
        if options['run_ai_analysis']:
            self.run_ai_analysis()

        self.stdout.write(
            self.style.SUCCESS('Incident Intelligence module setup completed!')
        )

    def _seed_records(self, model, lookup_field, records, label):
        """Insert ``records`` into ``model`` unless a matching row already exists.

        Args:
            model: Model class whose default manager provides ``get_or_create``.
            lookup_field: Key in each record used to detect an existing row.
            records: List of dicts of field values; each dict is also passed
                as ``defaults`` for a newly created row.
            label: Singular lowercase noun used in progress messages
                (for example ``'incident'``).

        Returns:
            The number of rows that were actually created.
        """
        created_count = 0
        # A single transaction so a failure part-way through leaves no
        # partially seeded data behind.
        with transaction.atomic():
            for record in records:
                obj, created = model.objects.get_or_create(
                    defaults=record,
                    **{lookup_field: record[lookup_field]},
                )
                display = getattr(obj, lookup_field)
                if created:
                    created_count += 1
                    self.stdout.write(f' Created {label}: {display}')
                else:
                    self.stdout.write(
                        f' {label.capitalize()} already exists: {display}'
                    )

        # Report what really happened instead of the size of the sample list,
        # which overstated the count whenever rows already existed.
        existing_count = len(records) - created_count
        self.stdout.write(
            self.style.SUCCESS(
                f'Created {created_count} sample {label}s '
                f'({existing_count} already existed)'
            )
        )
        return created_count

    def create_sample_data(self):
        """Create sample incidents for testing.

        Incidents are matched by ``title``; an incident whose title is already
        present is left untouched.
        """
        self.stdout.write('Creating sample incidents...')
        sample_incidents = [
            {
                'title': 'Database Connection Timeout',
                'description': 'Users are experiencing timeouts when trying to access the database. The issue started around 2 PM and affects all users.',
                'free_text': 'Database is down, can\'t connect, getting timeout errors',
                'severity': 'HIGH',
                'affected_users': 150,
                'business_impact': 'Critical business operations are affected. Users cannot access their data.',
            },
            {
                'title': 'API Response Slow',
                'description': 'The user service API is responding slowly, causing delays in user authentication and profile updates.',
                'free_text': 'API is slow, taking forever to respond, users complaining',
                'severity': 'MEDIUM',
                'affected_users': 50,
                'business_impact': 'User experience is degraded but core functionality still works.',
            },
            {
                'title': 'Payment Gateway Error',
                'description': 'Payment processing is failing with 500 errors. Customers cannot complete purchases.',
                'free_text': 'Payment not working, getting errors, customers can\'t buy',
                'severity': 'CRITICAL',
                'affected_users': 200,
                'business_impact': 'Revenue is directly impacted. Customers cannot make purchases.',
            },
            {
                'title': 'Email Service Down',
                'description': 'Email notifications are not being sent. Users are not receiving order confirmations and password reset emails.',
                'free_text': 'Emails not sending, notifications broken, users not getting emails',
                'severity': 'MEDIUM',
                'affected_users': 75,
                'business_impact': 'Communication with customers is disrupted.',
            },
            {
                'title': 'Mobile App Crash',
                'description': 'The mobile application is crashing on iOS devices when users try to view their order history.',
                'free_text': 'App crashing on iPhone, can\'t see orders, keeps closing',
                'severity': 'HIGH',
                'affected_users': 100,
                'business_impact': 'Mobile users cannot access their order information.',
            },
            # NOTE(review): same title as the first entry, and lookup is by
            # title, so this row is always reported as "already exists" and is
            # never inserted. If a near-duplicate incident is wanted for
            # testing, the lookup key needs to change -- confirm intent.
            {
                'title': 'Database Connection Timeout',
                'description': 'Users are experiencing timeouts when trying to access the database. The issue started around 3 PM and affects all users.',
                'free_text': 'Database is down, can\'t connect, getting timeout errors',
                'severity': 'HIGH',
                'affected_users': 150,
                'business_impact': 'Critical business operations are affected. Users cannot access their data.',
            },
            {
                'title': 'Load Balancer Issue',
                'description': 'The load balancer is not distributing traffic evenly, causing some servers to be overloaded.',
                'free_text': 'Load balancer not working properly, servers overloaded',
                'severity': 'HIGH',
                'affected_users': 300,
                'business_impact': 'System performance is degraded across multiple services.',
            },
            {
                'title': 'Cache Miss Rate High',
                'description': 'Redis cache is experiencing high miss rates, causing increased database load.',
                'free_text': 'Cache not working, database overloaded, slow responses',
                'severity': 'MEDIUM',
                'affected_users': 0,
                'business_impact': 'System performance is degraded but not directly visible to users.',
            },
        ]
        self._seed_records(Incident, 'title', sample_incidents, 'incident')

    def create_sample_patterns(self):
        """Create sample patterns.

        Patterns are matched by ``name``; a pattern whose name is already
        present is left untouched.
        """
        self.stdout.write('Creating sample patterns...')
        sample_patterns = [
            {
                'name': 'Database Connectivity Issues',
                'pattern_type': 'RECURRING',
                'description': 'Recurring database connection problems affecting multiple services',
                'frequency': 'Weekly',
                'affected_services': ['user-service', 'order-service', 'payment-service'],
                'common_keywords': ['database', 'connection', 'timeout', 'error'],
                'confidence_score': 0.85,
                'is_active': True,
                'is_resolved': False,
            },
            {
                'name': 'API Performance Degradation',
                'pattern_type': 'TREND',
                'description': 'Gradual degradation in API response times across services',
                'frequency': 'Daily',
                'affected_services': ['api-gateway', 'user-service', 'order-service'],
                'common_keywords': ['slow', 'performance', 'latency', 'timeout'],
                'confidence_score': 0.75,
                'is_active': True,
                'is_resolved': False,
            },
            {
                'name': 'Mobile App Crashes',
                'pattern_type': 'RECURRING',
                'description': 'Frequent crashes in mobile applications, particularly on iOS',
                'frequency': 'Bi-weekly',
                'affected_services': ['mobile-app', 'ios-app'],
                'common_keywords': ['crash', 'mobile', 'ios', 'app'],
                'confidence_score': 0.90,
                'is_active': True,
                'is_resolved': False,
            },
        ]
        self._seed_records(IncidentPattern, 'name', sample_patterns, 'pattern')

    def run_ai_analysis(self):
        """Run AI analysis on existing incidents.

        Collects the ids of all incidents with ``ai_processed=False`` and
        passes them, as strings, to ``batch_process_incidents_ai.delay`` in a
        single call. Any failure is reported on stdout rather than raised, so
        the remaining setup output is still printed.
        """
        self.stdout.write('Running AI analysis on existing incidents...')
        try:
            # Imported lazily so the seeding options keep working even when
            # the tasks module cannot be imported.
            from incident_intelligence.tasks import batch_process_incidents_ai

            # Fetch only the id column; the full model rows are not needed.
            unprocessed_ids = Incident.objects.filter(
                ai_processed=False
            ).values_list('id', flat=True)
            incident_ids = [str(incident_id) for incident_id in unprocessed_ids]

            if incident_ids:
                batch_process_incidents_ai.delay(incident_ids)
                self.stdout.write(
                    self.style.SUCCESS(f'Queued {len(incident_ids)} incidents for AI analysis')
                )
            else:
                self.stdout.write('No unprocessed incidents found')
        except Exception as e:
            # Deliberately broad: this is a best-effort step at the command
            # boundary, and import, database and dispatch errors are all
            # reported the same way.
            self.stdout.write(
                self.style.ERROR(f'Failed to run AI analysis: {e}')
            )