Updates

2025-09-19 11:58:53 +03:00
parent 306b20e24a
commit 6b247e5b9f
11423 changed files with 1500615 additions and 778 deletions
--- a/ETB-API/monitoring/management/__init__.py
+++ b/ETB-API/monitoring/management/__init__.py
@@ -0,0 +1 @@
+# Management commands for monitoring
--- a/ETB-API/monitoring/management/__pycache__/__init__.cpython-312.pyc
+++ b/ETB-API/monitoring/management/__pycache__/__init__.cpython-312.pyc
--- a/ETB-API/monitoring/management/commands/__init__.py
+++ b/ETB-API/monitoring/management/commands/__init__.py
@@ -0,0 +1 @@
+# Management commands
--- a/ETB-API/monitoring/management/commands/__pycache__/__init__.cpython-312.pyc
+++ b/ETB-API/monitoring/management/commands/__pycache__/__init__.cpython-312.pyc
--- a/ETB-API/monitoring/management/commands/__pycache__/setup_monitoring.cpython-312.pyc
+++ b/ETB-API/monitoring/management/commands/__pycache__/setup_monitoring.cpython-312.pyc
--- a/ETB-API/monitoring/management/commands/setup_monitoring.py
+++ b/ETB-API/monitoring/management/commands/setup_monitoring.py
@@ -0,0 +1,665 @@
+"""
+Management command to set up initial monitoring configuration
+"""
+from django.core.management.base import BaseCommand
+from django.contrib.auth import get_user_model
+from monitoring.models import (
+    MonitoringTarget, SystemMetric, AlertRule, MonitoringDashboard
+)
+
+User = get_user_model()
+
+
+class Command(BaseCommand):
+    help = 'Set up initial monitoring configuration'
+    
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--admin-user',
+            type=str,
+            help='Username of admin user to create monitoring objects',
+            default='admin'
+        )
+    
+    def handle(self, *args, **options):
+        admin_username = options['admin_user']
+        
+        try:
+            admin_user = User.objects.get(username=admin_username)
+        except User.DoesNotExist:
+            self.stdout.write(
+                self.style.ERROR(f'Admin user "{admin_username}" not found')
+            )
+            return
+        
+        self.stdout.write('Setting up monitoring configuration...')
+        
+        # Create default monitoring targets
+        self.create_default_targets(admin_user)
+        
+        # Create default metrics
+        self.create_default_metrics(admin_user)
+        
+        # Create default alert rules
+        self.create_default_alert_rules(admin_user)
+        
+        # Create default dashboards
+        self.create_default_dashboards(admin_user)
+        
+        self.stdout.write(
+            self.style.SUCCESS('Monitoring configuration setup completed!')
+        )
+    
+    def create_default_targets(self, admin_user):
+        """Create default monitoring targets"""
+        self.stdout.write('Creating default monitoring targets...')
+        
+        targets = [
+            {
+                'name': 'Django Application',
+                'description': 'Main Django application health check',
+                'target_type': 'APPLICATION',
+                'endpoint_url': 'http://localhost:8000/health/',
+                'related_module': 'core',
+                'health_check_enabled': True,
+                'expected_status_codes': [200]
+            },
+            {
+                'name': 'Database',
+                'description': 'Database connection health check',
+                'target_type': 'DATABASE',
+                'related_module': 'core',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Cache System',
+                'description': 'Cache system health check',
+                'target_type': 'CACHE',
+                'related_module': 'core',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Celery Workers',
+                'description': 'Celery worker health check',
+                'target_type': 'QUEUE',
+                'related_module': 'core',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Security Module',
+                'description': 'Security module health check',
+                'target_type': 'MODULE',
+                'related_module': 'security',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Incident Intelligence Module',
+                'description': 'Incident Intelligence module health check',
+                'target_type': 'MODULE',
+                'related_module': 'incident_intelligence',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Automation Orchestration Module',
+                'description': 'Automation Orchestration module health check',
+                'target_type': 'MODULE',
+                'related_module': 'automation_orchestration',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'SLA OnCall Module',
+                'description': 'SLA OnCall module health check',
+                'target_type': 'MODULE',
+                'related_module': 'sla_oncall',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Collaboration War Rooms Module',
+                'description': 'Collaboration War Rooms module health check',
+                'target_type': 'MODULE',
+                'related_module': 'collaboration_war_rooms',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Compliance Governance Module',
+                'description': 'Compliance Governance module health check',
+                'target_type': 'MODULE',
+                'related_module': 'compliance_governance',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Analytics Predictive Insights Module',
+                'description': 'Analytics Predictive Insights module health check',
+                'target_type': 'MODULE',
+                'related_module': 'analytics_predictive_insights',
+                'health_check_enabled': True
+            },
+            {
+                'name': 'Knowledge Learning Module',
+                'description': 'Knowledge Learning module health check',
+                'target_type': 'MODULE',
+                'related_module': 'knowledge_learning',
+                'health_check_enabled': True
+            }
+        ]
+        
+        for target_data in targets:
+            target, created = MonitoringTarget.objects.get_or_create(
+                name=target_data['name'],
+                defaults={
+                    **target_data,
+                    'created_by': admin_user
+                }
+            )
+            if created:
+                self.stdout.write(f'  Created target: {target.name}')
+            else:
+                self.stdout.write(f'  Target already exists: {target.name}')
+    
+    def create_default_metrics(self, admin_user):
+        """Create default system metrics"""
+        self.stdout.write('Creating default system metrics...')
+        
+        metrics = [
+            {
+                'name': 'API Response Time',
+                'description': 'Average API response time in milliseconds',
+                'metric_type': 'PERFORMANCE',
+                'category': 'API_RESPONSE_TIME',
+                'unit': 'milliseconds',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 300,
+                'warning_threshold': 1000,
+                'critical_threshold': 2000,
+                'is_system_metric': True
+            },
+            {
+                'name': 'Request Throughput',
+                'description': 'Number of requests per minute',
+                'metric_type': 'PERFORMANCE',
+                'category': 'THROUGHPUT',
+                'unit': 'requests/minute',
+                'aggregation_method': 'SUM',
+                'collection_interval_seconds': 60,
+                'warning_threshold': 1000,
+                'critical_threshold': 2000,
+                'is_system_metric': True
+            },
+            {
+                'name': 'Error Rate',
+                'description': 'Percentage of failed requests',
+                'metric_type': 'PERFORMANCE',
+                'category': 'ERROR_RATE',
+                'unit': 'percentage',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 300,
+                'warning_threshold': 5.0,
+                'critical_threshold': 10.0,
+                'is_system_metric': True
+            },
+            {
+                'name': 'System Availability',
+                'description': 'System availability percentage',
+                'metric_type': 'INFRASTRUCTURE',
+                'category': 'AVAILABILITY',
+                'unit': 'percentage',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 300,
+                'warning_threshold': 99.0,
+                'critical_threshold': 95.0,
+                'is_system_metric': True
+            },
+            {
+                'name': 'Incident Count',
+                'description': 'Number of incidents in the last 24 hours',
+                'metric_type': 'BUSINESS',
+                'category': 'INCIDENT_COUNT',
+                'unit': 'count',
+                'aggregation_method': 'COUNT',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 10,
+                'critical_threshold': 20,
+                'is_system_metric': True,
+                'related_module': 'incident_intelligence'
+            },
+            {
+                'name': 'Mean Time to Resolve',
+                'description': 'Average time to resolve incidents in minutes',
+                'metric_type': 'BUSINESS',
+                'category': 'MTTR',
+                'unit': 'minutes',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 120,
+                'critical_threshold': 240,
+                'is_system_metric': True,
+                'related_module': 'incident_intelligence'
+            },
+            {
+                'name': 'Mean Time to Acknowledge',
+                'description': 'Average time to acknowledge incidents in minutes',
+                'metric_type': 'BUSINESS',
+                'category': 'MTTA',
+                'unit': 'minutes',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 15,
+                'critical_threshold': 30,
+                'is_system_metric': True,
+                'related_module': 'incident_intelligence'
+            },
+            {
+                'name': 'SLA Compliance',
+                'description': 'SLA compliance percentage',
+                'metric_type': 'BUSINESS',
+                'category': 'SLA_COMPLIANCE',
+                'unit': 'percentage',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 95.0,
+                'critical_threshold': 90.0,
+                'is_system_metric': True,
+                'related_module': 'sla_oncall'
+            },
+            {
+                'name': 'Security Events',
+                'description': 'Number of security events in the last hour',
+                'metric_type': 'SECURITY',
+                'category': 'SECURITY_EVENTS',
+                'unit': 'count',
+                'aggregation_method': 'COUNT',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 5,
+                'critical_threshold': 10,
+                'is_system_metric': True,
+                'related_module': 'security'
+            },
+            {
+                'name': 'Automation Success Rate',
+                'description': 'Percentage of successful automation executions',
+                'metric_type': 'BUSINESS',
+                'category': 'AUTOMATION_SUCCESS',
+                'unit': 'percentage',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 90.0,
+                'critical_threshold': 80.0,
+                'is_system_metric': True,
+                'related_module': 'automation_orchestration'
+            },
+            {
+                'name': 'AI Model Accuracy',
+                'description': 'AI model accuracy percentage',
+                'metric_type': 'BUSINESS',
+                'category': 'AI_ACCURACY',
+                'unit': 'percentage',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 85.0,
+                'critical_threshold': 75.0,
+                'is_system_metric': True,
+                'related_module': 'incident_intelligence'
+            },
+            {
+                'name': 'Cost Impact',
+                'description': 'Total cost impact in USD for the last 30 days',
+                'metric_type': 'BUSINESS',
+                'category': 'COST_IMPACT',
+                'unit': 'USD',
+                'aggregation_method': 'SUM',
+                'collection_interval_seconds': 86400,
+                'warning_threshold': 10000,
+                'critical_threshold': 50000,
+                'is_system_metric': True,
+                'related_module': 'analytics_predictive_insights'
+            },
+            {
+                'name': 'User Activity',
+                'description': 'Number of active users in the last hour',
+                'metric_type': 'BUSINESS',
+                'category': 'USER_ACTIVITY',
+                'unit': 'count',
+                'aggregation_method': 'COUNT',
+                'collection_interval_seconds': 3600,
+                'warning_threshold': 50,
+                'critical_threshold': 100,
+                'is_system_metric': True
+            },
+            {
+                'name': 'CPU Usage',
+                'description': 'System CPU usage percentage',
+                'metric_type': 'INFRASTRUCTURE',
+                'category': 'SYSTEM_RESOURCES',
+                'unit': 'percentage',
+                'aggregation_method': 'AVERAGE',
+                'collection_interval_seconds': 300,
+                'warning_threshold': 80.0,
+                'critical_threshold': 90.0,
+                'is_system_metric': True
+            }
+        ]
+        
+        for metric_data in metrics:
+            metric, created = SystemMetric.objects.get_or_create(
+                name=metric_data['name'],
+                defaults={
+                    **metric_data,
+                    'created_by': admin_user
+                }
+            )
+            if created:
+                self.stdout.write(f'  Created metric: {metric.name}')
+            else:
+                self.stdout.write(f'  Metric already exists: {metric.name}')
+    
+    def create_default_alert_rules(self, admin_user):
+        """Create default alert rules"""
+        self.stdout.write('Creating default alert rules...')
+        
+        # Get metrics for alert rules
+        api_response_metric = SystemMetric.objects.filter(name='API Response Time').first()
+        error_rate_metric = SystemMetric.objects.filter(name='Error Rate').first()
+        availability_metric = SystemMetric.objects.filter(name='System Availability').first()
+        incident_count_metric = SystemMetric.objects.filter(name='Incident Count').first()
+        mttr_metric = SystemMetric.objects.filter(name='Mean Time to Resolve').first()
+        security_events_metric = SystemMetric.objects.filter(name='Security Events').first()
+        cpu_metric = SystemMetric.objects.filter(name='CPU Usage').first()
+        
+        alert_rules = [
+            {
+                'name': 'High API Response Time',
+                'description': 'Alert when API response time exceeds threshold',
+                'alert_type': 'THRESHOLD',
+                'severity': 'HIGH',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '>',
+                    'threshold': 2000
+                },
+                'metric': api_response_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            },
+            {
+                'name': 'High Error Rate',
+                'description': 'Alert when error rate exceeds threshold',
+                'alert_type': 'THRESHOLD',
+                'severity': 'CRITICAL',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '>',
+                    'threshold': 10.0
+                },
+                'metric': error_rate_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            },
+            {
+                'name': 'Low System Availability',
+                'description': 'Alert when system availability drops below threshold',
+                'alert_type': 'AVAILABILITY',
+                'severity': 'CRITICAL',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '<',
+                    'threshold': 95.0
+                },
+                'metric': availability_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            },
+            {
+                'name': 'High Incident Count',
+                'description': 'Alert when incident count exceeds threshold',
+                'alert_type': 'THRESHOLD',
+                'severity': 'HIGH',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '>',
+                    'threshold': 20
+                },
+                'metric': incident_count_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            },
+            {
+                'name': 'High MTTR',
+                'description': 'Alert when mean time to resolve exceeds threshold',
+                'alert_type': 'THRESHOLD',
+                'severity': 'MEDIUM',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '>',
+                    'threshold': 240
+                },
+                'metric': mttr_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            },
+            {
+                'name': 'High Security Events',
+                'description': 'Alert when security events exceed threshold',
+                'alert_type': 'THRESHOLD',
+                'severity': 'HIGH',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '>',
+                    'threshold': 10
+                },
+                'metric': security_events_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            },
+            {
+                'name': 'High CPU Usage',
+                'description': 'Alert when CPU usage exceeds threshold',
+                'alert_type': 'THRESHOLD',
+                'severity': 'HIGH',
+                'condition': {
+                    'type': 'THRESHOLD',
+                    'operator': '>',
+                    'threshold': 90.0
+                },
+                'metric': cpu_metric,
+                'notification_channels': [
+                    {
+                        'type': 'EMAIL',
+                        'recipients': ['admin@example.com']
+                    }
+                ]
+            }
+        ]
+        
+        for rule_data in alert_rules:
+            if rule_data['metric']:  # Only create if metric exists
+                rule, created = AlertRule.objects.get_or_create(
+                    name=rule_data['name'],
+                    defaults={
+                        **rule_data,
+                        'created_by': admin_user
+                    }
+                )
+                if created:
+                    self.stdout.write(f'  Created alert rule: {rule.name}')
+                else:
+                    self.stdout.write(f'  Alert rule already exists: {rule.name}')
+    
+    def create_default_dashboards(self, admin_user):
+        """Create default monitoring dashboards"""
+        self.stdout.write('Creating default monitoring dashboards...')
+        
+        dashboards = [
+            {
+                'name': 'System Overview',
+                'description': 'High-level system overview dashboard',
+                'dashboard_type': 'SYSTEM_OVERVIEW',
+                'is_public': True,
+                'auto_refresh_enabled': True,
+                'refresh_interval_seconds': 30,
+                'layout_config': {
+                    'columns': 3,
+                    'rows': 4
+                },
+                'widget_configs': [
+                    {
+                        'type': 'system_status',
+                        'position': {'x': 0, 'y': 0, 'width': 3, 'height': 1}
+                    },
+                    {
+                        'type': 'health_summary',
+                        'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'alert_summary',
+                        'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'system_resources',
+                        'position': {'x': 2, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'recent_incidents',
+                        'position': {'x': 0, 'y': 2, 'width': 2, 'height': 2}
+                    },
+                    {
+                        'type': 'metric_trends',
+                        'position': {'x': 2, 'y': 2, 'width': 1, 'height': 2}
+                    }
+                ]
+            },
+            {
+                'name': 'Performance Dashboard',
+                'description': 'System performance metrics dashboard',
+                'dashboard_type': 'PERFORMANCE',
+                'is_public': True,
+                'auto_refresh_enabled': True,
+                'refresh_interval_seconds': 60,
+                'layout_config': {
+                    'columns': 2,
+                    'rows': 3
+                },
+                'widget_configs': [
+                    {
+                        'type': 'api_response_time',
+                        'position': {'x': 0, 'y': 0, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'throughput',
+                        'position': {'x': 1, 'y': 0, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'error_rate',
+                        'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'availability',
+                        'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'system_resources',
+                        'position': {'x': 0, 'y': 2, 'width': 2, 'height': 1}
+                    }
+                ]
+            },
+            {
+                'name': 'Business Metrics Dashboard',
+                'description': 'Business and operational metrics dashboard',
+                'dashboard_type': 'BUSINESS_METRICS',
+                'is_public': True,
+                'auto_refresh_enabled': True,
+                'refresh_interval_seconds': 300,
+                'layout_config': {
+                    'columns': 2,
+                    'rows': 3
+                },
+                'widget_configs': [
+                    {
+                        'type': 'incident_count',
+                        'position': {'x': 0, 'y': 0, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'mttr',
+                        'position': {'x': 1, 'y': 0, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'mtta',
+                        'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'sla_compliance',
+                        'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'cost_impact',
+                        'position': {'x': 0, 'y': 2, 'width': 2, 'height': 1}
+                    }
+                ]
+            },
+            {
+                'name': 'Security Dashboard',
+                'description': 'Security monitoring dashboard',
+                'dashboard_type': 'SECURITY',
+                'is_public': False,
+                'auto_refresh_enabled': True,
+                'refresh_interval_seconds': 60,
+                'layout_config': {
+                    'columns': 2,
+                    'rows': 2
+                },
+                'widget_configs': [
+                    {
+                        'type': 'security_events',
+                        'position': {'x': 0, 'y': 0, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'failed_logins',
+                        'position': {'x': 1, 'y': 0, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'risk_assessments',
+                        'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1}
+                    },
+                    {
+                        'type': 'device_posture',
+                        'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1}
+                    }
+                ]
+            }
+        ]
+        
+        for dashboard_data in dashboards:
+            dashboard, created = MonitoringDashboard.objects.get_or_create(
+                name=dashboard_data['name'],
+                defaults={
+                    **dashboard_data,
+                    'created_by': admin_user
+                }
+            )
+            if created:
+                self.stdout.write(f'  Created dashboard: {dashboard.name}')
+            else:
+                self.stdout.write(f'  Dashboard already exists: {dashboard.name}')