"""
Management command to set up initial monitoring configuration
"""
|
|
from django.contrib.auth import get_user_model
from django.core.management.base import BaseCommand, CommandError

from monitoring.models import (
    AlertRule,
    MonitoringDashboard,
    MonitoringTarget,
    SystemMetric,
)
|
|
|
|
# Resolve the user model through Django's helper rather than importing a
# concrete class, so the lookup in Command.handle uses whichever user model
# the project has configured.
User = get_user_model()
|
|
|
|
|
|
class Command(BaseCommand):
    """Seed the monitoring app with default targets, metrics, alert rules and dashboards.

    Every object is looked up by ``name`` through ``get_or_create``, so the
    command can be re-run: objects that already exist are reported and left
    untouched, missing ones are created with ``created_by`` set to the
    admin user selected with ``--admin-user``.
    """

    help = 'Set up initial monitoring configuration'

    def add_arguments(self, parser):
        """Register the ``--admin-user`` option (defaults to ``admin``)."""
        parser.add_argument(
            '--admin-user',
            type=str,
            default='admin',
            help='Username of admin user to create monitoring objects',
        )

    def handle(self, *args, **options):
        """Create all default monitoring objects on behalf of the admin user.

        Raises:
            CommandError: if no user with the requested username exists.
        """
        admin_username = options['admin_user']

        try:
            admin_user = User.objects.get(username=admin_username)
        except User.DoesNotExist as exc:
            # Raise instead of printing to stdout and returning: a bare
            # return lets the command finish "successfully" even though
            # nothing was set up, which hides the failure from scripts.
            raise CommandError(
                f'Admin user "{admin_username}" not found'
            ) from exc

        self.stdout.write('Setting up monitoring configuration...')

        # Metrics are seeded before alert rules because every rule is
        # attached to a metric that it looks up by name.
        self.create_default_targets(admin_user)
        self.create_default_metrics(admin_user)
        self.create_default_alert_rules(admin_user)
        self.create_default_dashboards(admin_user)

        self.stdout.write(
            self.style.SUCCESS('Monitoring configuration setup completed!')
        )

    def _ensure(self, model, fields, admin_user, label):
        """Create one ``model`` row from ``fields`` unless its name already exists.

        Args:
            model: model class to seed.
            fields: field values for the new row; must contain ``'name'``,
                which is also the lookup key.
            admin_user: user stored as ``created_by`` on a newly created row.
            label: lower-case noun used in the progress message
                (for example ``'target'`` or ``'alert rule'``).
        """
        obj, created = model.objects.get_or_create(
            name=fields['name'],
            defaults={**fields, 'created_by': admin_user},
        )
        if created:
            self.stdout.write(f' Created {label}: {obj.name}')
        else:
            self.stdout.write(f' {label.capitalize()} already exists: {obj.name}')

    @staticmethod
    def _widget(widget_type, x, y, width, height):
        """Build one dashboard widget config with its grid position."""
        return {
            'type': widget_type,
            'position': {'x': x, 'y': y, 'width': width, 'height': height},
        }

    def create_default_targets(self, admin_user):
        """Create the default monitoring targets (core services, then one per module)."""
        self.stdout.write('Creating default monitoring targets...')

        targets = [
            {
                'name': 'Django Application',
                'description': 'Main Django application health check',
                'target_type': 'APPLICATION',
                'endpoint_url': 'http://localhost:8000/health/',
                'related_module': 'core',
                'health_check_enabled': True,
                'expected_status_codes': [200],
            },
            {
                'name': 'Database',
                'description': 'Database connection health check',
                'target_type': 'DATABASE',
                'related_module': 'core',
                'health_check_enabled': True,
            },
            {
                'name': 'Cache System',
                'description': 'Cache system health check',
                'target_type': 'CACHE',
                'related_module': 'core',
                'health_check_enabled': True,
            },
            {
                'name': 'Celery Workers',
                'description': 'Celery worker health check',
                'target_type': 'QUEUE',
                'related_module': 'core',
                'health_check_enabled': True,
            },
        ]

        # The module targets differ only in display label and module slug.
        module_targets = [
            ('Security', 'security'),
            ('Incident Intelligence', 'incident_intelligence'),
            ('Automation Orchestration', 'automation_orchestration'),
            ('SLA OnCall', 'sla_oncall'),
            ('Collaboration War Rooms', 'collaboration_war_rooms'),
            ('Compliance Governance', 'compliance_governance'),
            ('Analytics Predictive Insights', 'analytics_predictive_insights'),
            ('Knowledge Learning', 'knowledge_learning'),
        ]
        targets.extend(
            {
                'name': f'{module_label} Module',
                'description': f'{module_label} module health check',
                'target_type': 'MODULE',
                'related_module': module_slug,
                'health_check_enabled': True,
            }
            for module_label, module_slug in module_targets
        )

        for target_data in targets:
            self._ensure(MonitoringTarget, target_data, admin_user, 'target')

    def create_default_metrics(self, admin_user):
        """Create the default system metrics with their warning/critical thresholds."""
        self.stdout.write('Creating default system metrics...')

        metrics = [
            {
                'name': 'API Response Time',
                'description': 'Average API response time in milliseconds',
                'metric_type': 'PERFORMANCE',
                'category': 'API_RESPONSE_TIME',
                'unit': 'milliseconds',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 1000,
                'critical_threshold': 2000,
                'is_system_metric': True,
            },
            {
                'name': 'Request Throughput',
                'description': 'Number of requests per minute',
                'metric_type': 'PERFORMANCE',
                'category': 'THROUGHPUT',
                'unit': 'requests/minute',
                'aggregation_method': 'SUM',
                'collection_interval_seconds': 60,
                'warning_threshold': 1000,
                'critical_threshold': 2000,
                'is_system_metric': True,
            },
            {
                'name': 'Error Rate',
                'description': 'Percentage of failed requests',
                'metric_type': 'PERFORMANCE',
                'category': 'ERROR_RATE',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 5.0,
                'critical_threshold': 10.0,
                'is_system_metric': True,
            },
            {
                'name': 'System Availability',
                'description': 'System availability percentage',
                'metric_type': 'INFRASTRUCTURE',
                'category': 'AVAILABILITY',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 99.0,
                'critical_threshold': 95.0,
                'is_system_metric': True,
            },
            {
                'name': 'Incident Count',
                'description': 'Number of incidents in the last 24 hours',
                'metric_type': 'BUSINESS',
                'category': 'INCIDENT_COUNT',
                'unit': 'count',
                'aggregation_method': 'COUNT',
                'collection_interval_seconds': 3600,
                'warning_threshold': 10,
                'critical_threshold': 20,
                'is_system_metric': True,
                'related_module': 'incident_intelligence',
            },
            {
                'name': 'Mean Time to Resolve',
                'description': 'Average time to resolve incidents in minutes',
                'metric_type': 'BUSINESS',
                'category': 'MTTR',
                'unit': 'minutes',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 120,
                'critical_threshold': 240,
                'is_system_metric': True,
                'related_module': 'incident_intelligence',
            },
            {
                'name': 'Mean Time to Acknowledge',
                'description': 'Average time to acknowledge incidents in minutes',
                'metric_type': 'BUSINESS',
                'category': 'MTTA',
                'unit': 'minutes',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 15,
                'critical_threshold': 30,
                'is_system_metric': True,
                'related_module': 'incident_intelligence',
            },
            {
                'name': 'SLA Compliance',
                'description': 'SLA compliance percentage',
                'metric_type': 'BUSINESS',
                'category': 'SLA_COMPLIANCE',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 95.0,
                'critical_threshold': 90.0,
                'is_system_metric': True,
                'related_module': 'sla_oncall',
            },
            {
                'name': 'Security Events',
                'description': 'Number of security events in the last hour',
                'metric_type': 'SECURITY',
                'category': 'SECURITY_EVENTS',
                'unit': 'count',
                'aggregation_method': 'COUNT',
                'collection_interval_seconds': 3600,
                'warning_threshold': 5,
                'critical_threshold': 10,
                'is_system_metric': True,
                'related_module': 'security',
            },
            {
                'name': 'Automation Success Rate',
                'description': 'Percentage of successful automation executions',
                'metric_type': 'BUSINESS',
                'category': 'AUTOMATION_SUCCESS',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 90.0,
                'critical_threshold': 80.0,
                'is_system_metric': True,
                'related_module': 'automation_orchestration',
            },
            {
                'name': 'AI Model Accuracy',
                'description': 'AI model accuracy percentage',
                'metric_type': 'BUSINESS',
                'category': 'AI_ACCURACY',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 85.0,
                'critical_threshold': 75.0,
                'is_system_metric': True,
                'related_module': 'incident_intelligence',
            },
            {
                'name': 'Cost Impact',
                'description': 'Total cost impact in USD for the last 30 days',
                'metric_type': 'BUSINESS',
                'category': 'COST_IMPACT',
                'unit': 'USD',
                'aggregation_method': 'SUM',
                'collection_interval_seconds': 86400,
                'warning_threshold': 10000,
                'critical_threshold': 50000,
                'is_system_metric': True,
                'related_module': 'analytics_predictive_insights',
            },
            {
                'name': 'User Activity',
                'description': 'Number of active users in the last hour',
                'metric_type': 'BUSINESS',
                'category': 'USER_ACTIVITY',
                'unit': 'count',
                'aggregation_method': 'COUNT',
                'collection_interval_seconds': 3600,
                'warning_threshold': 50,
                'critical_threshold': 100,
                'is_system_metric': True,
            },
            {
                'name': 'CPU Usage',
                'description': 'System CPU usage percentage',
                'metric_type': 'INFRASTRUCTURE',
                'category': 'SYSTEM_RESOURCES',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 80.0,
                'critical_threshold': 90.0,
                'is_system_metric': True,
            },
        ]

        for metric_data in metrics:
            self._ensure(SystemMetric, metric_data, admin_user, 'metric')

    def create_default_alert_rules(self, admin_user):
        """Create the default alert rules, each bound to a metric found by name.

        A rule whose metric does not exist is skipped with a warning rather
        than silently dropped, so an incomplete setup is visible in the output.
        """
        self.stdout.write('Creating default alert rules...')

        # Only the fields that vary between rules; the condition and
        # notification-channel structures are assembled in the loop below.
        rule_specs = [
            {
                'name': 'High API Response Time',
                'description': 'Alert when API response time exceeds threshold',
                'alert_type': 'THRESHOLD',
                'severity': 'HIGH',
                'operator': '>',
                'threshold': 2000,
                'metric_name': 'API Response Time',
            },
            {
                'name': 'High Error Rate',
                'description': 'Alert when error rate exceeds threshold',
                'alert_type': 'THRESHOLD',
                'severity': 'CRITICAL',
                'operator': '>',
                'threshold': 10.0,
                'metric_name': 'Error Rate',
            },
            {
                'name': 'Low System Availability',
                'description': 'Alert when system availability drops below threshold',
                'alert_type': 'AVAILABILITY',
                'severity': 'CRITICAL',
                'operator': '<',
                'threshold': 95.0,
                'metric_name': 'System Availability',
            },
            {
                'name': 'High Incident Count',
                'description': 'Alert when incident count exceeds threshold',
                'alert_type': 'THRESHOLD',
                'severity': 'HIGH',
                'operator': '>',
                'threshold': 20,
                'metric_name': 'Incident Count',
            },
            {
                'name': 'High MTTR',
                'description': 'Alert when mean time to resolve exceeds threshold',
                'alert_type': 'THRESHOLD',
                'severity': 'MEDIUM',
                'operator': '>',
                'threshold': 240,
                'metric_name': 'Mean Time to Resolve',
            },
            {
                'name': 'High Security Events',
                'description': 'Alert when security events exceed threshold',
                'alert_type': 'THRESHOLD',
                'severity': 'HIGH',
                'operator': '>',
                'threshold': 10,
                'metric_name': 'Security Events',
            },
            {
                'name': 'High CPU Usage',
                'description': 'Alert when CPU usage exceeds threshold',
                'alert_type': 'THRESHOLD',
                'severity': 'HIGH',
                'operator': '>',
                'threshold': 90.0,
                'metric_name': 'CPU Usage',
            },
        ]

        for spec in rule_specs:
            rule_name = spec['name']
            metric_name = spec['metric_name']

            metric = SystemMetric.objects.filter(name=metric_name).first()
            if metric is None:
                self.stdout.write(
                    self.style.WARNING(
                        f' Skipped alert rule: {rule_name} '
                        f'(metric "{metric_name}" not found)'
                    )
                )
                continue

            rule_data = {
                'name': rule_name,
                'description': spec['description'],
                'alert_type': spec['alert_type'],
                'severity': spec['severity'],
                'condition': {
                    'type': 'THRESHOLD',
                    'operator': spec['operator'],
                    'threshold': spec['threshold'],
                },
                'metric': metric,
                # NOTE(review): placeholder recipient address - confirm the
                # real notification target before relying on these alerts.
                'notification_channels': [
                    {
                        'type': 'EMAIL',
                        'recipients': ['admin@example.com'],
                    }
                ],
            }
            self._ensure(AlertRule, rule_data, admin_user, 'alert rule')

    def create_default_dashboards(self, admin_user):
        """Create the default monitoring dashboards and their widget layouts."""
        self.stdout.write('Creating default monitoring dashboards...')

        widget = self._widget  # arguments: type, x, y, width, height

        dashboards = [
            {
                'name': 'System Overview',
                'description': 'High-level system overview dashboard',
                'dashboard_type': 'SYSTEM_OVERVIEW',
                'is_public': True,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 30,
                'layout_config': {'columns': 3, 'rows': 4},
                'widget_configs': [
                    widget('system_status', 0, 0, 3, 1),
                    widget('health_summary', 0, 1, 1, 1),
                    widget('alert_summary', 1, 1, 1, 1),
                    widget('system_resources', 2, 1, 1, 1),
                    widget('recent_incidents', 0, 2, 2, 2),
                    widget('metric_trends', 2, 2, 1, 2),
                ],
            },
            {
                'name': 'Performance Dashboard',
                'description': 'System performance metrics dashboard',
                'dashboard_type': 'PERFORMANCE',
                'is_public': True,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 60,
                'layout_config': {'columns': 2, 'rows': 3},
                'widget_configs': [
                    widget('api_response_time', 0, 0, 1, 1),
                    widget('throughput', 1, 0, 1, 1),
                    widget('error_rate', 0, 1, 1, 1),
                    widget('availability', 1, 1, 1, 1),
                    widget('system_resources', 0, 2, 2, 1),
                ],
            },
            {
                'name': 'Business Metrics Dashboard',
                'description': 'Business and operational metrics dashboard',
                'dashboard_type': 'BUSINESS_METRICS',
                'is_public': True,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 300,
                'layout_config': {'columns': 2, 'rows': 3},
                'widget_configs': [
                    widget('incident_count', 0, 0, 1, 1),
                    widget('mttr', 1, 0, 1, 1),
                    widget('mtta', 0, 1, 1, 1),
                    widget('sla_compliance', 1, 1, 1, 1),
                    widget('cost_impact', 0, 2, 2, 1),
                ],
            },
            {
                'name': 'Security Dashboard',
                'description': 'Security monitoring dashboard',
                'dashboard_type': 'SECURITY',
                'is_public': False,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 60,
                'layout_config': {'columns': 2, 'rows': 2},
                'widget_configs': [
                    widget('security_events', 0, 0, 1, 1),
                    widget('failed_logins', 1, 0, 1, 1),
                    widget('risk_assessments', 0, 1, 1, 1),
                    widget('device_posture', 1, 1, 1, 1),
                ],
            },
        ]

        for dashboard_data in dashboards:
            self._ensure(MonitoringDashboard, dashboard_data, admin_user, 'dashboard')