# ETB/ETB-API/monitoring/management/commands/setup_monitoring.py

"""
Management command to set up initial monitoring configuration
"""
from django.core.management.base import BaseCommand
from django.contrib.auth import get_user_model
from monitoring.models import (
    MonitoringTarget, SystemMetric, AlertRule, MonitoringDashboard
)

# Resolve the project's active user model once at import time; handle()
# uses it to look up the admin user by username.
User = get_user_model()


class Command(BaseCommand):
    help = 'Set up initial monitoring configuration'

    def add_arguments(self, parser):
        parser.add_argument(
            '--admin-user',
            type=str,
            help='Username of admin user to create monitoring objects',
            default='admin'
        )

    def handle(self, *args, **options):
        """Create the default monitoring configuration for an admin user.

        Looks up the user named by ``--admin-user`` and then creates, in
        order, the default targets, metrics, alert rules and dashboards.
        Metrics are created before alert rules because every alert rule is
        attached to a metric that it looks up by name.

        Args:
            *args: Unused positional arguments passed by Django.
            **options: Parsed command-line options; ``admin_user`` is read.

        Raises:
            CommandError: If no user with the requested username exists.
                Raising (rather than printing and returning) makes the
                command exit with a non-zero status, so scripts and CI jobs
                can detect that nothing was set up.
        """
        # Imported here so this method does not depend on the module-level
        # import list being edited as well.
        from django.core.management.base import CommandError

        admin_username = options['admin_user']

        try:
            admin_user = User.objects.get(username=admin_username)
        except User.DoesNotExist as exc:
            raise CommandError(
                f'Admin user "{admin_username}" not found'
            ) from exc

        self.stdout.write('Setting up monitoring configuration...')

        # Create default monitoring targets
        self.create_default_targets(admin_user)

        # Create default metrics (must precede alert rules, which reference them)
        self.create_default_metrics(admin_user)

        # Create default alert rules
        self.create_default_alert_rules(admin_user)

        # Create default dashboards
        self.create_default_dashboards(admin_user)

        self.stdout.write(
            self.style.SUCCESS('Monitoring configuration setup completed!')
        )

    def create_default_targets(self, admin_user):
        """Seed the stock ``MonitoringTarget`` rows.

        Targets are matched by name with ``get_or_create``: a target that
        already exists is left as it is and only reported, while a missing
        one is created with ``admin_user`` stored as ``created_by``. One
        progress line is written to stdout per target.
        """
        self.stdout.write('Creating default monitoring targets...')

        def health_target(name, description, target_type,
                          related_module='core', **extra):
            """Assemble the field values for one health-checked target."""
            fields = {
                'name': name,
                'description': description,
                'target_type': target_type,
                'related_module': related_module,
                'health_check_enabled': True,
            }
            fields.update(extra)
            return fields

        # Infrastructure-level targets, all owned by the "core" module.
        core_targets = [
            health_target(
                'Django Application',
                'Main Django application health check',
                'APPLICATION',
                endpoint_url='http://localhost:8000/health/',
                expected_status_codes=[200],
            ),
            health_target(
                'Database', 'Database connection health check', 'DATABASE'
            ),
            health_target(
                'Cache System', 'Cache system health check', 'CACHE'
            ),
            health_target(
                'Celery Workers', 'Celery worker health check', 'QUEUE'
            ),
        ]

        # One target per application module: (display label, module slug).
        module_labels = (
            ('Security', 'security'),
            ('Incident Intelligence', 'incident_intelligence'),
            ('Automation Orchestration', 'automation_orchestration'),
            ('SLA OnCall', 'sla_oncall'),
            ('Collaboration War Rooms', 'collaboration_war_rooms'),
            ('Compliance Governance', 'compliance_governance'),
            ('Analytics Predictive Insights', 'analytics_predictive_insights'),
            ('Knowledge Learning', 'knowledge_learning'),
        )
        module_targets = [
            health_target(
                f'{label} Module',
                f'{label} module health check',
                'MODULE',
                related_module=slug,
            )
            for label, slug in module_labels
        ]

        for fields in core_targets + module_targets:
            obj, was_created = MonitoringTarget.objects.get_or_create(
                name=fields['name'],
                defaults=dict(fields, created_by=admin_user),
            )
            message = (
                f'  Created target: {obj.name}'
                if was_created
                else f'  Target already exists: {obj.name}'
            )
            self.stdout.write(message)

    def create_default_metrics(self, admin_user):
        """Seed the built-in ``SystemMetric`` definitions.

        Metrics are matched by name with ``get_or_create``: an existing
        metric is left as it is and only reported, while a missing one is
        created with ``admin_user`` stored as ``created_by``. One progress
        line is written to stdout per metric.
        """
        self.stdout.write('Creating default system metrics...')

        def metric_fields(name, description, metric_type, category, unit,
                          aggregation, interval, warning, critical,
                          module=None):
            """Assemble the field values for one system metric.

            ``related_module`` is only included when a module is given, so
            metrics without one fall back to whatever the model defines.
            """
            fields = {
                'name': name,
                'description': description,
                'metric_type': metric_type,
                'category': category,
                'unit': unit,
                'aggregation_method': aggregation,
                'collection_interval_seconds': interval,
                'warning_threshold': warning,
                'critical_threshold': critical,
                'is_system_metric': True,
            }
            if module is not None:
                fields['related_module'] = module
            return fields

        # Positional columns: name, description, metric type, category, unit,
        # aggregation method, collection interval (s), warning threshold,
        # critical threshold, and optionally the related module.
        catalogue = (
            metric_fields(
                'API Response Time',
                'Average API response time in milliseconds',
                'PERFORMANCE', 'API_RESPONSE_TIME', 'milliseconds', 'AVERAGE',
                300, 1000, 2000,
            ),
            metric_fields(
                'Request Throughput',
                'Number of requests per minute',
                'PERFORMANCE', 'THROUGHPUT', 'requests/minute', 'SUM',
                60, 1000, 2000,
            ),
            metric_fields(
                'Error Rate',
                'Percentage of failed requests',
                'PERFORMANCE', 'ERROR_RATE', 'percentage', 'AVERAGE',
                300, 5.0, 10.0,
            ),
            metric_fields(
                'System Availability',
                'System availability percentage',
                'INFRASTRUCTURE', 'AVAILABILITY', 'percentage', 'AVERAGE',
                300, 99.0, 95.0,
            ),
            metric_fields(
                'Incident Count',
                'Number of incidents in the last 24 hours',
                'BUSINESS', 'INCIDENT_COUNT', 'count', 'COUNT',
                3600, 10, 20,
                module='incident_intelligence',
            ),
            metric_fields(
                'Mean Time to Resolve',
                'Average time to resolve incidents in minutes',
                'BUSINESS', 'MTTR', 'minutes', 'AVERAGE',
                3600, 120, 240,
                module='incident_intelligence',
            ),
            metric_fields(
                'Mean Time to Acknowledge',
                'Average time to acknowledge incidents in minutes',
                'BUSINESS', 'MTTA', 'minutes', 'AVERAGE',
                3600, 15, 30,
                module='incident_intelligence',
            ),
            metric_fields(
                'SLA Compliance',
                'SLA compliance percentage',
                'BUSINESS', 'SLA_COMPLIANCE', 'percentage', 'AVERAGE',
                3600, 95.0, 90.0,
                module='sla_oncall',
            ),
            metric_fields(
                'Security Events',
                'Number of security events in the last hour',
                'SECURITY', 'SECURITY_EVENTS', 'count', 'COUNT',
                3600, 5, 10,
                module='security',
            ),
            metric_fields(
                'Automation Success Rate',
                'Percentage of successful automation executions',
                'BUSINESS', 'AUTOMATION_SUCCESS', 'percentage', 'AVERAGE',
                3600, 90.0, 80.0,
                module='automation_orchestration',
            ),
            metric_fields(
                'AI Model Accuracy',
                'AI model accuracy percentage',
                'BUSINESS', 'AI_ACCURACY', 'percentage', 'AVERAGE',
                3600, 85.0, 75.0,
                module='incident_intelligence',
            ),
            metric_fields(
                'Cost Impact',
                'Total cost impact in USD for the last 30 days',
                'BUSINESS', 'COST_IMPACT', 'USD', 'SUM',
                86400, 10000, 50000,
                module='analytics_predictive_insights',
            ),
            metric_fields(
                'User Activity',
                'Number of active users in the last hour',
                'BUSINESS', 'USER_ACTIVITY', 'count', 'COUNT',
                3600, 50, 100,
            ),
            metric_fields(
                'CPU Usage',
                'System CPU usage percentage',
                'INFRASTRUCTURE', 'SYSTEM_RESOURCES', 'percentage', 'AVERAGE',
                300, 80.0, 90.0,
            ),
        )

        for fields in catalogue:
            obj, was_created = SystemMetric.objects.get_or_create(
                name=fields['name'],
                defaults=dict(fields, created_by=admin_user),
            )
            message = (
                f'  Created metric: {obj.name}'
                if was_created
                else f'  Metric already exists: {obj.name}'
            )
            self.stdout.write(message)

    def create_default_alert_rules(self, admin_user):
        """Create the default alert rules, each attached to a system metric.

        Rules are matched by name with ``get_or_create``, so an existing rule
        is left untouched and only reported. A rule is created only when the
        ``SystemMetric`` it watches can be found by name; a rule whose metric
        is missing is skipped and a warning is written, so an incomplete
        setup is visible in the command output instead of passing silently.

        Args:
            admin_user: User stored as ``created_by`` on newly created rules.
        """
        self.stdout.write('Creating default alert rules...')

        # Columns: rule name, description, alert type, severity,
        # comparison operator, threshold, name of the metric being watched.
        rule_specs = (
            ('High API Response Time',
             'Alert when API response time exceeds threshold',
             'THRESHOLD', 'HIGH', '>', 2000, 'API Response Time'),
            ('High Error Rate',
             'Alert when error rate exceeds threshold',
             'THRESHOLD', 'CRITICAL', '>', 10.0, 'Error Rate'),
            ('Low System Availability',
             'Alert when system availability drops below threshold',
             'AVAILABILITY', 'CRITICAL', '<', 95.0, 'System Availability'),
            ('High Incident Count',
             'Alert when incident count exceeds threshold',
             'THRESHOLD', 'HIGH', '>', 20, 'Incident Count'),
            ('High MTTR',
             'Alert when mean time to resolve exceeds threshold',
             'THRESHOLD', 'MEDIUM', '>', 240, 'Mean Time to Resolve'),
            ('High Security Events',
             'Alert when security events exceed threshold',
             'THRESHOLD', 'HIGH', '>', 10, 'Security Events'),
            ('High CPU Usage',
             'Alert when CPU usage exceeds threshold',
             'THRESHOLD', 'HIGH', '>', 90.0, 'CPU Usage'),
        )

        for (name, description, alert_type, severity,
             operator, threshold, metric_name) in rule_specs:
            metric = SystemMetric.objects.filter(name=metric_name).first()
            if metric is None:
                # Report the gap: a rule without its metric cannot be created.
                self.stdout.write(
                    self.style.WARNING(
                        f'  Skipped alert rule: {name} '
                        f'(metric "{metric_name}" not found)'
                    )
                )
                continue

            rule, created = AlertRule.objects.get_or_create(
                name=name,
                defaults={
                    'description': description,
                    'alert_type': alert_type,
                    'severity': severity,
                    'condition': {
                        'type': 'THRESHOLD',
                        'operator': operator,
                        'threshold': threshold,
                    },
                    'metric': metric,
                    # NOTE(review): placeholder recipient; presumably meant to
                    # be replaced per deployment - confirm before relying on it.
                    'notification_channels': [
                        {
                            'type': 'EMAIL',
                            'recipients': ['admin@example.com'],
                        }
                    ],
                    'created_by': admin_user,
                },
            )
            if created:
                self.stdout.write(f'  Created alert rule: {rule.name}')
            else:
                self.stdout.write(f'  Alert rule already exists: {rule.name}')

    def create_default_dashboards(self, admin_user):
        """Seed the stock ``MonitoringDashboard`` definitions.

        Dashboards are matched by name with ``get_or_create``: an existing
        dashboard is left as it is and only reported, while a missing one is
        created with ``admin_user`` stored as ``created_by``. One progress
        line is written to stdout per dashboard.
        """
        self.stdout.write('Creating default monitoring dashboards...')

        def widget(kind, x, y, width=1, height=1):
            """Describe one widget and the grid cell(s) it occupies."""
            return {
                'type': kind,
                'position': {'x': x, 'y': y, 'width': width, 'height': height},
            }

        def board(name, description, dashboard_type, refresh, grid, widgets,
                  public=True):
            """Assemble the field values for one auto-refreshing dashboard."""
            columns, rows = grid
            return {
                'name': name,
                'description': description,
                'dashboard_type': dashboard_type,
                'is_public': public,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': refresh,
                'layout_config': {'columns': columns, 'rows': rows},
                'widget_configs': widgets,
            }

        catalogue = (
            board(
                'System Overview',
                'High-level system overview dashboard',
                'SYSTEM_OVERVIEW',
                refresh=30,
                grid=(3, 4),
                widgets=[
                    widget('system_status', 0, 0, width=3),
                    widget('health_summary', 0, 1),
                    widget('alert_summary', 1, 1),
                    widget('system_resources', 2, 1),
                    widget('recent_incidents', 0, 2, width=2, height=2),
                    widget('metric_trends', 2, 2, height=2),
                ],
            ),
            board(
                'Performance Dashboard',
                'System performance metrics dashboard',
                'PERFORMANCE',
                refresh=60,
                grid=(2, 3),
                widgets=[
                    widget('api_response_time', 0, 0),
                    widget('throughput', 1, 0),
                    widget('error_rate', 0, 1),
                    widget('availability', 1, 1),
                    widget('system_resources', 0, 2, width=2),
                ],
            ),
            board(
                'Business Metrics Dashboard',
                'Business and operational metrics dashboard',
                'BUSINESS_METRICS',
                refresh=300,
                grid=(2, 3),
                widgets=[
                    widget('incident_count', 0, 0),
                    widget('mttr', 1, 0),
                    widget('mtta', 0, 1),
                    widget('sla_compliance', 1, 1),
                    widget('cost_impact', 0, 2, width=2),
                ],
            ),
            board(
                'Security Dashboard',
                'Security monitoring dashboard',
                'SECURITY',
                refresh=60,
                grid=(2, 2),
                widgets=[
                    widget('security_events', 0, 0),
                    widget('failed_logins', 1, 0),
                    widget('risk_assessments', 0, 1),
                    widget('device_posture', 1, 1),
                ],
                public=False,
            ),
        )

        for fields in catalogue:
            obj, was_created = MonitoringDashboard.objects.get_or_create(
                name=fields['name'],
                defaults=dict(fields, created_by=admin_user),
            )
            message = (
                f'  Created dashboard: {obj.name}'
                if was_created
                else f'  Dashboard already exists: {obj.name}'
            )
            self.stdout.write(message)