This commit is contained in:
Iliyan Angelov
2025-09-19 11:58:53 +03:00
parent 306b20e24a
commit 6b247e5b9f
11423 changed files with 1500615 additions and 778 deletions

View File

@@ -0,0 +1 @@
# Management commands for monitoring

View File

@@ -0,0 +1 @@
# Management commands

View File

@@ -0,0 +1,665 @@
"""
Management command to set up initial monitoring configuration
"""
from django.core.management.base import BaseCommand
from django.contrib.auth import get_user_model
from monitoring.models import (
MonitoringTarget, SystemMetric, AlertRule, MonitoringDashboard
)
# Resolve the user model through Django's get_user_model() helper instead of
# importing a concrete user class directly.
User = get_user_model()
class Command(BaseCommand):
    """Seed the monitoring app with default targets, metrics, alert rules and dashboards.

    Every object is created through ``get_or_create`` keyed on ``name``: when
    an object with that name already exists it is left untouched and reported
    as existing; otherwise it is created with ``created_by`` set to the admin
    user selected via ``--admin-user``.
    """

    help = 'Set up initial monitoring configuration'

    # (display title, related_module value) for each per-module health-check
    # target; the name and description are derived from the title.
    _MODULE_TARGETS = (
        ('Security', 'security'),
        ('Incident Intelligence', 'incident_intelligence'),
        ('Automation Orchestration', 'automation_orchestration'),
        ('SLA OnCall', 'sla_oncall'),
        ('Collaboration War Rooms', 'collaboration_war_rooms'),
        ('Compliance Governance', 'compliance_governance'),
        ('Analytics Predictive Insights', 'analytics_predictive_insights'),
        ('Knowledge Learning', 'knowledge_learning'),
    )

    def add_arguments(self, parser):
        """Register the ``--admin-user`` option (defaults to ``'admin'``)."""
        parser.add_argument(
            '--admin-user',
            type=str,
            help='Username of admin user to create monitoring objects',
            default='admin'
        )

    def handle(self, *args, **options):
        """Create all default monitoring objects on behalf of the admin user.

        If no user with the requested username exists, an error message is
        written and nothing is created.
        """
        admin_username = options['admin_user']
        try:
            admin_user = User.objects.get(username=admin_username)
        except User.DoesNotExist:
            # NOTE(review): this reports the problem on stdout and returns
            # normally; consider raising CommandError so callers can detect
            # the failure. Behaviour is intentionally left unchanged here.
            self.stdout.write(
                self.style.ERROR(f'Admin user "{admin_username}" not found')
            )
            return

        self.stdout.write('Setting up monitoring configuration...')

        # Metrics must be created before alert rules: each rule looks up the
        # metric it is attached to by name.
        self.create_default_targets(admin_user)
        self.create_default_metrics(admin_user)
        self.create_default_alert_rules(admin_user)
        self.create_default_dashboards(admin_user)

        self.stdout.write(
            self.style.SUCCESS('Monitoring configuration setup completed!')
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _ensure_object(self, model, label, spec, admin_user):
        """Get or create one ``model`` row named ``spec['name']`` and report the outcome.

        ``label`` is the lower-case noun used in the progress message
        (e.g. ``'target'``). All of ``spec`` plus ``created_by`` is passed as
        ``defaults``, so it is only applied when a new row is created.
        """
        obj, created = model.objects.get_or_create(
            name=spec['name'],
            defaults={
                **spec,
                'created_by': admin_user
            }
        )
        if created:
            self.stdout.write(f' Created {label}: {obj.name}')
        else:
            self.stdout.write(f' {label.capitalize()} already exists: {obj.name}')
        return obj

    @staticmethod
    def _alert_rule(name, description, severity, operator, threshold, metric,
                    alert_type='THRESHOLD'):
        """Build the field dict for one threshold-style alert rule.

        A fresh condition and notification-channel structure is built on
        every call so that rules never share mutable objects.
        """
        return {
            'name': name,
            'description': description,
            'alert_type': alert_type,
            'severity': severity,
            'condition': {
                'type': 'THRESHOLD',
                'operator': operator,
                'threshold': threshold
            },
            'metric': metric,
            'notification_channels': [
                {
                    'type': 'EMAIL',
                    'recipients': ['admin@example.com']
                }
            ]
        }

    @staticmethod
    def _widget(widget_type, x, y, width, height):
        """Build one dashboard widget config with its grid position."""
        return {
            'type': widget_type,
            'position': {'x': x, 'y': y, 'width': width, 'height': height}
        }

    # ------------------------------------------------------------------
    # Default object creation
    # ------------------------------------------------------------------

    def create_default_targets(self, admin_user):
        """Create default monitoring targets"""
        self.stdout.write('Creating default monitoring targets...')
        targets = [
            {
                'name': 'Django Application',
                'description': 'Main Django application health check',
                'target_type': 'APPLICATION',
                'endpoint_url': 'http://localhost:8000/health/',
                'related_module': 'core',
                'health_check_enabled': True,
                'expected_status_codes': [200]
            },
            {
                'name': 'Database',
                'description': 'Database connection health check',
                'target_type': 'DATABASE',
                'related_module': 'core',
                'health_check_enabled': True
            },
            {
                'name': 'Cache System',
                'description': 'Cache system health check',
                'target_type': 'CACHE',
                'related_module': 'core',
                'health_check_enabled': True
            },
            {
                'name': 'Celery Workers',
                'description': 'Celery worker health check',
                'target_type': 'QUEUE',
                'related_module': 'core',
                'health_check_enabled': True
            },
        ]
        # One MODULE target per application module, all following one pattern.
        targets.extend(
            {
                'name': f'{title} Module',
                'description': f'{title} module health check',
                'target_type': 'MODULE',
                'related_module': module,
                'health_check_enabled': True
            }
            for title, module in self._MODULE_TARGETS
        )
        for target_data in targets:
            self._ensure_object(MonitoringTarget, 'target', target_data, admin_user)

    def create_default_metrics(self, admin_user):
        """Create default system metrics"""
        self.stdout.write('Creating default system metrics...')
        metrics = [
            {
                'name': 'API Response Time',
                'description': 'Average API response time in milliseconds',
                'metric_type': 'PERFORMANCE',
                'category': 'API_RESPONSE_TIME',
                'unit': 'milliseconds',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 1000,
                'critical_threshold': 2000,
                'is_system_metric': True
            },
            {
                'name': 'Request Throughput',
                'description': 'Number of requests per minute',
                'metric_type': 'PERFORMANCE',
                'category': 'THROUGHPUT',
                'unit': 'requests/minute',
                'aggregation_method': 'SUM',
                'collection_interval_seconds': 60,
                'warning_threshold': 1000,
                'critical_threshold': 2000,
                'is_system_metric': True
            },
            {
                'name': 'Error Rate',
                'description': 'Percentage of failed requests',
                'metric_type': 'PERFORMANCE',
                'category': 'ERROR_RATE',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 5.0,
                'critical_threshold': 10.0,
                'is_system_metric': True
            },
            {
                'name': 'System Availability',
                'description': 'System availability percentage',
                'metric_type': 'INFRASTRUCTURE',
                'category': 'AVAILABILITY',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 99.0,
                'critical_threshold': 95.0,
                'is_system_metric': True
            },
            {
                'name': 'Incident Count',
                'description': 'Number of incidents in the last 24 hours',
                'metric_type': 'BUSINESS',
                'category': 'INCIDENT_COUNT',
                'unit': 'count',
                'aggregation_method': 'COUNT',
                'collection_interval_seconds': 3600,
                'warning_threshold': 10,
                'critical_threshold': 20,
                'is_system_metric': True,
                'related_module': 'incident_intelligence'
            },
            {
                'name': 'Mean Time to Resolve',
                'description': 'Average time to resolve incidents in minutes',
                'metric_type': 'BUSINESS',
                'category': 'MTTR',
                'unit': 'minutes',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 120,
                'critical_threshold': 240,
                'is_system_metric': True,
                'related_module': 'incident_intelligence'
            },
            {
                'name': 'Mean Time to Acknowledge',
                'description': 'Average time to acknowledge incidents in minutes',
                'metric_type': 'BUSINESS',
                'category': 'MTTA',
                'unit': 'minutes',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 15,
                'critical_threshold': 30,
                'is_system_metric': True,
                'related_module': 'incident_intelligence'
            },
            {
                'name': 'SLA Compliance',
                'description': 'SLA compliance percentage',
                'metric_type': 'BUSINESS',
                'category': 'SLA_COMPLIANCE',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 95.0,
                'critical_threshold': 90.0,
                'is_system_metric': True,
                'related_module': 'sla_oncall'
            },
            {
                'name': 'Security Events',
                'description': 'Number of security events in the last hour',
                'metric_type': 'SECURITY',
                'category': 'SECURITY_EVENTS',
                'unit': 'count',
                'aggregation_method': 'COUNT',
                'collection_interval_seconds': 3600,
                'warning_threshold': 5,
                'critical_threshold': 10,
                'is_system_metric': True,
                'related_module': 'security'
            },
            {
                'name': 'Automation Success Rate',
                'description': 'Percentage of successful automation executions',
                'metric_type': 'BUSINESS',
                'category': 'AUTOMATION_SUCCESS',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 90.0,
                'critical_threshold': 80.0,
                'is_system_metric': True,
                'related_module': 'automation_orchestration'
            },
            {
                'name': 'AI Model Accuracy',
                'description': 'AI model accuracy percentage',
                'metric_type': 'BUSINESS',
                'category': 'AI_ACCURACY',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 3600,
                'warning_threshold': 85.0,
                'critical_threshold': 75.0,
                'is_system_metric': True,
                'related_module': 'incident_intelligence'
            },
            {
                'name': 'Cost Impact',
                'description': 'Total cost impact in USD for the last 30 days',
                'metric_type': 'BUSINESS',
                'category': 'COST_IMPACT',
                'unit': 'USD',
                'aggregation_method': 'SUM',
                'collection_interval_seconds': 86400,
                'warning_threshold': 10000,
                'critical_threshold': 50000,
                'is_system_metric': True,
                'related_module': 'analytics_predictive_insights'
            },
            {
                'name': 'User Activity',
                'description': 'Number of active users in the last hour',
                'metric_type': 'BUSINESS',
                'category': 'USER_ACTIVITY',
                'unit': 'count',
                'aggregation_method': 'COUNT',
                'collection_interval_seconds': 3600,
                'warning_threshold': 50,
                'critical_threshold': 100,
                'is_system_metric': True
            },
            {
                'name': 'CPU Usage',
                'description': 'System CPU usage percentage',
                'metric_type': 'INFRASTRUCTURE',
                'category': 'SYSTEM_RESOURCES',
                'unit': 'percentage',
                'aggregation_method': 'AVERAGE',
                'collection_interval_seconds': 300,
                'warning_threshold': 80.0,
                'critical_threshold': 90.0,
                'is_system_metric': True
            }
        ]
        for metric_data in metrics:
            self._ensure_object(SystemMetric, 'metric', metric_data, admin_user)

    def create_default_alert_rules(self, admin_user):
        """Create default alert rules.

        Each rule is attached to a ``SystemMetric`` looked up by name. A rule
        whose metric cannot be found is not created; a warning naming the
        skipped rule is written instead.
        """
        self.stdout.write('Creating default alert rules...')

        # Look up every metric the rules refer to before creating any rule.
        metric_names = (
            'API Response Time',
            'Error Rate',
            'System Availability',
            'Incident Count',
            'Mean Time to Resolve',
            'Security Events',
            'CPU Usage',
        )
        metrics = {
            metric_name: SystemMetric.objects.filter(name=metric_name).first()
            for metric_name in metric_names
        }

        alert_rules = [
            self._alert_rule(
                name='High API Response Time',
                description='Alert when API response time exceeds threshold',
                severity='HIGH',
                operator='>',
                threshold=2000,
                metric=metrics['API Response Time'],
            ),
            self._alert_rule(
                name='High Error Rate',
                description='Alert when error rate exceeds threshold',
                severity='CRITICAL',
                operator='>',
                threshold=10.0,
                metric=metrics['Error Rate'],
            ),
            self._alert_rule(
                name='Low System Availability',
                description='Alert when system availability drops below threshold',
                severity='CRITICAL',
                operator='<',
                threshold=95.0,
                metric=metrics['System Availability'],
                alert_type='AVAILABILITY',
            ),
            self._alert_rule(
                name='High Incident Count',
                description='Alert when incident count exceeds threshold',
                severity='HIGH',
                operator='>',
                threshold=20,
                metric=metrics['Incident Count'],
            ),
            self._alert_rule(
                name='High MTTR',
                description='Alert when mean time to resolve exceeds threshold',
                severity='MEDIUM',
                operator='>',
                threshold=240,
                metric=metrics['Mean Time to Resolve'],
            ),
            self._alert_rule(
                name='High Security Events',
                description='Alert when security events exceed threshold',
                severity='HIGH',
                operator='>',
                threshold=10,
                metric=metrics['Security Events'],
            ),
            self._alert_rule(
                name='High CPU Usage',
                description='Alert when CPU usage exceeds threshold',
                severity='HIGH',
                operator='>',
                threshold=90.0,
                metric=metrics['CPU Usage'],
            ),
        ]

        for rule_data in alert_rules:
            if not rule_data['metric']:
                # Previously skipped without any output; make the gap visible.
                self.stdout.write(
                    self.style.WARNING(
                        f" Skipped alert rule (metric not found): {rule_data['name']}"
                    )
                )
                continue
            self._ensure_object(AlertRule, 'alert rule', rule_data, admin_user)

    def create_default_dashboards(self, admin_user):
        """Create default monitoring dashboards"""
        self.stdout.write('Creating default monitoring dashboards...')
        widget = self._widget
        dashboards = [
            {
                'name': 'System Overview',
                'description': 'High-level system overview dashboard',
                'dashboard_type': 'SYSTEM_OVERVIEW',
                'is_public': True,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 30,
                'layout_config': {
                    'columns': 3,
                    'rows': 4
                },
                'widget_configs': [
                    widget('system_status', 0, 0, 3, 1),
                    widget('health_summary', 0, 1, 1, 1),
                    widget('alert_summary', 1, 1, 1, 1),
                    widget('system_resources', 2, 1, 1, 1),
                    widget('recent_incidents', 0, 2, 2, 2),
                    widget('metric_trends', 2, 2, 1, 2),
                ]
            },
            {
                'name': 'Performance Dashboard',
                'description': 'System performance metrics dashboard',
                'dashboard_type': 'PERFORMANCE',
                'is_public': True,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 60,
                'layout_config': {
                    'columns': 2,
                    'rows': 3
                },
                'widget_configs': [
                    widget('api_response_time', 0, 0, 1, 1),
                    widget('throughput', 1, 0, 1, 1),
                    widget('error_rate', 0, 1, 1, 1),
                    widget('availability', 1, 1, 1, 1),
                    widget('system_resources', 0, 2, 2, 1),
                ]
            },
            {
                'name': 'Business Metrics Dashboard',
                'description': 'Business and operational metrics dashboard',
                'dashboard_type': 'BUSINESS_METRICS',
                'is_public': True,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 300,
                'layout_config': {
                    'columns': 2,
                    'rows': 3
                },
                'widget_configs': [
                    widget('incident_count', 0, 0, 1, 1),
                    widget('mttr', 1, 0, 1, 1),
                    widget('mtta', 0, 1, 1, 1),
                    widget('sla_compliance', 1, 1, 1, 1),
                    widget('cost_impact', 0, 2, 2, 1),
                ]
            },
            {
                'name': 'Security Dashboard',
                'description': 'Security monitoring dashboard',
                'dashboard_type': 'SECURITY',
                'is_public': False,
                'auto_refresh_enabled': True,
                'refresh_interval_seconds': 60,
                'layout_config': {
                    'columns': 2,
                    'rows': 2
                },
                'widget_configs': [
                    widget('security_events', 0, 0, 1, 1),
                    widget('failed_logins', 1, 0, 1, 1),
                    widget('risk_assessments', 0, 1, 1, 1),
                    widget('device_posture', 1, 1, 1, 1),
                ]
            }
        ]
        for dashboard_data in dashboards:
            self._ensure_object(
                MonitoringDashboard, 'dashboard', dashboard_data, admin_user
            )