""" Management command to set up initial monitoring configuration """ from django.core.management.base import BaseCommand from django.contrib.auth import get_user_model from monitoring.models import ( MonitoringTarget, SystemMetric, AlertRule, MonitoringDashboard ) User = get_user_model() class Command(BaseCommand): help = 'Set up initial monitoring configuration' def add_arguments(self, parser): parser.add_argument( '--admin-user', type=str, help='Username of admin user to create monitoring objects', default='admin' ) def handle(self, *args, **options): admin_username = options['admin_user'] try: admin_user = User.objects.get(username=admin_username) except User.DoesNotExist: self.stdout.write( self.style.ERROR(f'Admin user "{admin_username}" not found') ) return self.stdout.write('Setting up monitoring configuration...') # Create default monitoring targets self.create_default_targets(admin_user) # Create default metrics self.create_default_metrics(admin_user) # Create default alert rules self.create_default_alert_rules(admin_user) # Create default dashboards self.create_default_dashboards(admin_user) self.stdout.write( self.style.SUCCESS('Monitoring configuration setup completed!') ) def create_default_targets(self, admin_user): """Create default monitoring targets""" self.stdout.write('Creating default monitoring targets...') targets = [ { 'name': 'Django Application', 'description': 'Main Django application health check', 'target_type': 'APPLICATION', 'endpoint_url': 'http://localhost:8000/health/', 'related_module': 'core', 'health_check_enabled': True, 'expected_status_codes': [200] }, { 'name': 'Database', 'description': 'Database connection health check', 'target_type': 'DATABASE', 'related_module': 'core', 'health_check_enabled': True }, { 'name': 'Cache System', 'description': 'Cache system health check', 'target_type': 'CACHE', 'related_module': 'core', 'health_check_enabled': True }, { 'name': 'Celery Workers', 'description': 'Celery worker health check', 'target_type': 'QUEUE', 'related_module': 'core', 'health_check_enabled': True }, { 'name': 'Security Module', 'description': 'Security module health check', 'target_type': 'MODULE', 'related_module': 'security', 'health_check_enabled': True }, { 'name': 'Incident Intelligence Module', 'description': 'Incident Intelligence module health check', 'target_type': 'MODULE', 'related_module': 'incident_intelligence', 'health_check_enabled': True }, { 'name': 'Automation Orchestration Module', 'description': 'Automation Orchestration module health check', 'target_type': 'MODULE', 'related_module': 'automation_orchestration', 'health_check_enabled': True }, { 'name': 'SLA OnCall Module', 'description': 'SLA OnCall module health check', 'target_type': 'MODULE', 'related_module': 'sla_oncall', 'health_check_enabled': True }, { 'name': 'Collaboration War Rooms Module', 'description': 'Collaboration War Rooms module health check', 'target_type': 'MODULE', 'related_module': 'collaboration_war_rooms', 'health_check_enabled': True }, { 'name': 'Compliance Governance Module', 'description': 'Compliance Governance module health check', 'target_type': 'MODULE', 'related_module': 'compliance_governance', 'health_check_enabled': True }, { 'name': 'Analytics Predictive Insights Module', 'description': 'Analytics Predictive Insights module health check', 'target_type': 'MODULE', 'related_module': 'analytics_predictive_insights', 'health_check_enabled': True }, { 'name': 'Knowledge Learning Module', 'description': 'Knowledge Learning module health check', 'target_type': 'MODULE', 'related_module': 'knowledge_learning', 'health_check_enabled': True } ] for target_data in targets: target, created = MonitoringTarget.objects.get_or_create( name=target_data['name'], defaults={ **target_data, 'created_by': admin_user } ) if created: self.stdout.write(f' Created target: {target.name}') else: self.stdout.write(f' Target already exists: {target.name}') def create_default_metrics(self, admin_user): """Create default system metrics""" self.stdout.write('Creating default system metrics...') metrics = [ { 'name': 'API Response Time', 'description': 'Average API response time in milliseconds', 'metric_type': 'PERFORMANCE', 'category': 'API_RESPONSE_TIME', 'unit': 'milliseconds', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 300, 'warning_threshold': 1000, 'critical_threshold': 2000, 'is_system_metric': True }, { 'name': 'Request Throughput', 'description': 'Number of requests per minute', 'metric_type': 'PERFORMANCE', 'category': 'THROUGHPUT', 'unit': 'requests/minute', 'aggregation_method': 'SUM', 'collection_interval_seconds': 60, 'warning_threshold': 1000, 'critical_threshold': 2000, 'is_system_metric': True }, { 'name': 'Error Rate', 'description': 'Percentage of failed requests', 'metric_type': 'PERFORMANCE', 'category': 'ERROR_RATE', 'unit': 'percentage', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 300, 'warning_threshold': 5.0, 'critical_threshold': 10.0, 'is_system_metric': True }, { 'name': 'System Availability', 'description': 'System availability percentage', 'metric_type': 'INFRASTRUCTURE', 'category': 'AVAILABILITY', 'unit': 'percentage', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 300, 'warning_threshold': 99.0, 'critical_threshold': 95.0, 'is_system_metric': True }, { 'name': 'Incident Count', 'description': 'Number of incidents in the last 24 hours', 'metric_type': 'BUSINESS', 'category': 'INCIDENT_COUNT', 'unit': 'count', 'aggregation_method': 'COUNT', 'collection_interval_seconds': 3600, 'warning_threshold': 10, 'critical_threshold': 20, 'is_system_metric': True, 'related_module': 'incident_intelligence' }, { 'name': 'Mean Time to Resolve', 'description': 'Average time to resolve incidents in minutes', 'metric_type': 'BUSINESS', 'category': 'MTTR', 'unit': 'minutes', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 3600, 'warning_threshold': 120, 'critical_threshold': 240, 'is_system_metric': True, 'related_module': 'incident_intelligence' }, { 'name': 'Mean Time to Acknowledge', 'description': 'Average time to acknowledge incidents in minutes', 'metric_type': 'BUSINESS', 'category': 'MTTA', 'unit': 'minutes', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 3600, 'warning_threshold': 15, 'critical_threshold': 30, 'is_system_metric': True, 'related_module': 'incident_intelligence' }, { 'name': 'SLA Compliance', 'description': 'SLA compliance percentage', 'metric_type': 'BUSINESS', 'category': 'SLA_COMPLIANCE', 'unit': 'percentage', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 3600, 'warning_threshold': 95.0, 'critical_threshold': 90.0, 'is_system_metric': True, 'related_module': 'sla_oncall' }, { 'name': 'Security Events', 'description': 'Number of security events in the last hour', 'metric_type': 'SECURITY', 'category': 'SECURITY_EVENTS', 'unit': 'count', 'aggregation_method': 'COUNT', 'collection_interval_seconds': 3600, 'warning_threshold': 5, 'critical_threshold': 10, 'is_system_metric': True, 'related_module': 'security' }, { 'name': 'Automation Success Rate', 'description': 'Percentage of successful automation executions', 'metric_type': 'BUSINESS', 'category': 'AUTOMATION_SUCCESS', 'unit': 'percentage', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 3600, 'warning_threshold': 90.0, 'critical_threshold': 80.0, 'is_system_metric': True, 'related_module': 'automation_orchestration' }, { 'name': 'AI Model Accuracy', 'description': 'AI model accuracy percentage', 'metric_type': 'BUSINESS', 'category': 'AI_ACCURACY', 'unit': 'percentage', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 3600, 'warning_threshold': 85.0, 'critical_threshold': 75.0, 'is_system_metric': True, 'related_module': 'incident_intelligence' }, { 'name': 'Cost Impact', 'description': 'Total cost impact in USD for the last 30 days', 'metric_type': 'BUSINESS', 'category': 'COST_IMPACT', 'unit': 'USD', 'aggregation_method': 'SUM', 'collection_interval_seconds': 86400, 'warning_threshold': 10000, 'critical_threshold': 50000, 'is_system_metric': True, 'related_module': 'analytics_predictive_insights' }, { 'name': 'User Activity', 'description': 'Number of active users in the last hour', 'metric_type': 'BUSINESS', 'category': 'USER_ACTIVITY', 'unit': 'count', 'aggregation_method': 'COUNT', 'collection_interval_seconds': 3600, 'warning_threshold': 50, 'critical_threshold': 100, 'is_system_metric': True }, { 'name': 'CPU Usage', 'description': 'System CPU usage percentage', 'metric_type': 'INFRASTRUCTURE', 'category': 'SYSTEM_RESOURCES', 'unit': 'percentage', 'aggregation_method': 'AVERAGE', 'collection_interval_seconds': 300, 'warning_threshold': 80.0, 'critical_threshold': 90.0, 'is_system_metric': True } ] for metric_data in metrics: metric, created = SystemMetric.objects.get_or_create( name=metric_data['name'], defaults={ **metric_data, 'created_by': admin_user } ) if created: self.stdout.write(f' Created metric: {metric.name}') else: self.stdout.write(f' Metric already exists: {metric.name}') def create_default_alert_rules(self, admin_user): """Create default alert rules""" self.stdout.write('Creating default alert rules...') # Get metrics for alert rules api_response_metric = SystemMetric.objects.filter(name='API Response Time').first() error_rate_metric = SystemMetric.objects.filter(name='Error Rate').first() availability_metric = SystemMetric.objects.filter(name='System Availability').first() incident_count_metric = SystemMetric.objects.filter(name='Incident Count').first() mttr_metric = SystemMetric.objects.filter(name='Mean Time to Resolve').first() security_events_metric = SystemMetric.objects.filter(name='Security Events').first() cpu_metric = SystemMetric.objects.filter(name='CPU Usage').first() alert_rules = [ { 'name': 'High API Response Time', 'description': 'Alert when API response time exceeds threshold', 'alert_type': 'THRESHOLD', 'severity': 'HIGH', 'condition': { 'type': 'THRESHOLD', 'operator': '>', 'threshold': 2000 }, 'metric': api_response_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] }, { 'name': 'High Error Rate', 'description': 'Alert when error rate exceeds threshold', 'alert_type': 'THRESHOLD', 'severity': 'CRITICAL', 'condition': { 'type': 'THRESHOLD', 'operator': '>', 'threshold': 10.0 }, 'metric': error_rate_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] }, { 'name': 'Low System Availability', 'description': 'Alert when system availability drops below threshold', 'alert_type': 'AVAILABILITY', 'severity': 'CRITICAL', 'condition': { 'type': 'THRESHOLD', 'operator': '<', 'threshold': 95.0 }, 'metric': availability_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] }, { 'name': 'High Incident Count', 'description': 'Alert when incident count exceeds threshold', 'alert_type': 'THRESHOLD', 'severity': 'HIGH', 'condition': { 'type': 'THRESHOLD', 'operator': '>', 'threshold': 20 }, 'metric': incident_count_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] }, { 'name': 'High MTTR', 'description': 'Alert when mean time to resolve exceeds threshold', 'alert_type': 'THRESHOLD', 'severity': 'MEDIUM', 'condition': { 'type': 'THRESHOLD', 'operator': '>', 'threshold': 240 }, 'metric': mttr_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] }, { 'name': 'High Security Events', 'description': 'Alert when security events exceed threshold', 'alert_type': 'THRESHOLD', 'severity': 'HIGH', 'condition': { 'type': 'THRESHOLD', 'operator': '>', 'threshold': 10 }, 'metric': security_events_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] }, { 'name': 'High CPU Usage', 'description': 'Alert when CPU usage exceeds threshold', 'alert_type': 'THRESHOLD', 'severity': 'HIGH', 'condition': { 'type': 'THRESHOLD', 'operator': '>', 'threshold': 90.0 }, 'metric': cpu_metric, 'notification_channels': [ { 'type': 'EMAIL', 'recipients': ['admin@example.com'] } ] } ] for rule_data in alert_rules: if rule_data['metric']: # Only create if metric exists rule, created = AlertRule.objects.get_or_create( name=rule_data['name'], defaults={ **rule_data, 'created_by': admin_user } ) if created: self.stdout.write(f' Created alert rule: {rule.name}') else: self.stdout.write(f' Alert rule already exists: {rule.name}') def create_default_dashboards(self, admin_user): """Create default monitoring dashboards""" self.stdout.write('Creating default monitoring dashboards...') dashboards = [ { 'name': 'System Overview', 'description': 'High-level system overview dashboard', 'dashboard_type': 'SYSTEM_OVERVIEW', 'is_public': True, 'auto_refresh_enabled': True, 'refresh_interval_seconds': 30, 'layout_config': { 'columns': 3, 'rows': 4 }, 'widget_configs': [ { 'type': 'system_status', 'position': {'x': 0, 'y': 0, 'width': 3, 'height': 1} }, { 'type': 'health_summary', 'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'alert_summary', 'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'system_resources', 'position': {'x': 2, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'recent_incidents', 'position': {'x': 0, 'y': 2, 'width': 2, 'height': 2} }, { 'type': 'metric_trends', 'position': {'x': 2, 'y': 2, 'width': 1, 'height': 2} } ] }, { 'name': 'Performance Dashboard', 'description': 'System performance metrics dashboard', 'dashboard_type': 'PERFORMANCE', 'is_public': True, 'auto_refresh_enabled': True, 'refresh_interval_seconds': 60, 'layout_config': { 'columns': 2, 'rows': 3 }, 'widget_configs': [ { 'type': 'api_response_time', 'position': {'x': 0, 'y': 0, 'width': 1, 'height': 1} }, { 'type': 'throughput', 'position': {'x': 1, 'y': 0, 'width': 1, 'height': 1} }, { 'type': 'error_rate', 'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'availability', 'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'system_resources', 'position': {'x': 0, 'y': 2, 'width': 2, 'height': 1} } ] }, { 'name': 'Business Metrics Dashboard', 'description': 'Business and operational metrics dashboard', 'dashboard_type': 'BUSINESS_METRICS', 'is_public': True, 'auto_refresh_enabled': True, 'refresh_interval_seconds': 300, 'layout_config': { 'columns': 2, 'rows': 3 }, 'widget_configs': [ { 'type': 'incident_count', 'position': {'x': 0, 'y': 0, 'width': 1, 'height': 1} }, { 'type': 'mttr', 'position': {'x': 1, 'y': 0, 'width': 1, 'height': 1} }, { 'type': 'mtta', 'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'sla_compliance', 'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'cost_impact', 'position': {'x': 0, 'y': 2, 'width': 2, 'height': 1} } ] }, { 'name': 'Security Dashboard', 'description': 'Security monitoring dashboard', 'dashboard_type': 'SECURITY', 'is_public': False, 'auto_refresh_enabled': True, 'refresh_interval_seconds': 60, 'layout_config': { 'columns': 2, 'rows': 2 }, 'widget_configs': [ { 'type': 'security_events', 'position': {'x': 0, 'y': 0, 'width': 1, 'height': 1} }, { 'type': 'failed_logins', 'position': {'x': 1, 'y': 0, 'width': 1, 'height': 1} }, { 'type': 'risk_assessments', 'position': {'x': 0, 'y': 1, 'width': 1, 'height': 1} }, { 'type': 'device_posture', 'position': {'x': 1, 'y': 1, 'width': 1, 'height': 1} } ] } ] for dashboard_data in dashboards: dashboard, created = MonitoringDashboard.objects.get_or_create( name=dashboard_data['name'], defaults={ **dashboard_data, 'created_by': admin_user } ) if created: self.stdout.write(f' Created dashboard: {dashboard.name}') else: self.stdout.write(f' Dashboard already exists: {dashboard.name}')