# Files
# ETB/ETB-API/monitoring/models.py
# Iliyan Angelov 6b247e5b9f Updates
# 2025-09-19 11:58:53 +03:00
#
# 516 lines
# 16 KiB
# Python

"""
Monitoring models for comprehensive system observability
"""
import uuid
import json
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from decimal import Decimal
from django.db import models
from django.contrib.auth import get_user_model
from django.core.validators import MinValueValidator, MaxValueValidator
from django.utils import timezone
from django.core.exceptions import ValidationError
# Resolve the user model through Django's lookup helper instead of importing a
# concrete class; the models below use this name for their user relations.
User = get_user_model()
class MonitoringTarget(models.Model):
    """A system, service, or component registered for monitoring.

    Each row carries the target's identity (unique ``name``, ``target_type``),
    optional connection details, check/health-check settings, and the most
    recently recorded status. Rows sort by ``name`` by default.
    """

    # Allowed values for ``target_type``.
    TARGET_TYPES = [
        ('APPLICATION', 'Application'),
        ('DATABASE', 'Database'),
        ('CACHE', 'Cache'),
        ('QUEUE', 'Message Queue'),
        ('EXTERNAL_API', 'External API'),
        ('SERVICE', 'Internal Service'),
        ('INFRASTRUCTURE', 'Infrastructure'),
        ('MODULE', 'Django Module'),
    ]

    # Allowed values for ``status`` (the target's own lifecycle state).
    STATUS_CHOICES = [
        ('ACTIVE', 'Active'),
        ('INACTIVE', 'Inactive'),
        ('MAINTENANCE', 'Maintenance'),
        ('ERROR', 'Error'),
    ]

    # --- Identity -----------------------------------------------------------
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    name = models.CharField(max_length=200, unique=True)
    description = models.TextField()
    target_type = models.CharField(max_length=20, choices=TARGET_TYPES)

    # --- Connection details -------------------------------------------------
    endpoint_url = models.URLField(blank=True, null=True)
    # NOTE(review): the help text mentions credentials; this is a plain
    # JSONField, so confirm how secrets are protected at rest.
    connection_config = models.JSONField(
        default=dict,
        help_text="Connection configuration (credentials, timeouts, etc.)",
    )

    # --- Monitoring configuration -------------------------------------------
    check_interval_seconds = models.PositiveIntegerField(default=60)
    timeout_seconds = models.PositiveIntegerField(default=30)
    retry_count = models.PositiveIntegerField(default=3)

    # --- Health check configuration -----------------------------------------
    health_check_enabled = models.BooleanField(default=True)
    health_check_endpoint = models.CharField(
        max_length=200,
        blank=True,
        null=True,
    )
    expected_status_codes = models.JSONField(
        default=list,
        help_text="Expected HTTP status codes for health checks",
    )

    # --- Status and metadata ------------------------------------------------
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='ACTIVE',
    )
    last_checked = models.DateTimeField(null=True, blank=True)
    # Starts as 'UNKNOWN' until a different value is stored.
    last_status = models.CharField(
        max_length=20,
        choices=[
            ('HEALTHY', 'Healthy'),
            ('WARNING', 'Warning'),
            ('CRITICAL', 'Critical'),
            ('UNKNOWN', 'Unknown'),
        ],
        default='UNKNOWN',
    )

    # --- Related module (if applicable) -------------------------------------
    related_module = models.CharField(
        max_length=50,
        blank=True,
        null=True,
        help_text="Related Django module (e.g., 'security', 'incident_intelligence')",
    )

    # --- Audit metadata -----------------------------------------------------
    # SET_NULL keeps the target when the creating user is deleted.
    created_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
    )
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['target_type', 'status']),
            models.Index(fields=['related_module']),
            models.Index(fields=['last_checked']),
        ]

    def __str__(self):
        """Return ``"<name> (<target_type>)"``."""
        label, kind = self.name, self.target_type
        return f"{label} ({kind})"
class HealthCheck(models.Model):
    """One recorded health-check result for a ``MonitoringTarget``.

    Stores the check type and resulting status, optional response details
    (HTTP status code, body, error message), optional resource-usage
    percentages, and the creation timestamp. Results sort newest first.

    The three ``*_usage_percent`` fields are validated to lie in the closed
    range 0-100. Django validators run during ``full_clean()`` and form
    validation, not on a bare ``save()``.
    """

    # Allowed values for ``check_type``.
    CHECK_TYPES = [
        ('HTTP', 'HTTP Health Check'),
        ('DATABASE', 'Database Connection'),
        ('CACHE', 'Cache Connection'),
        ('QUEUE', 'Message Queue'),
        ('CUSTOM', 'Custom Check'),
        ('PING', 'Network Ping'),
        ('SSL', 'SSL Certificate'),
    ]

    # Allowed values for ``status``.
    STATUS_CHOICES = [
        ('HEALTHY', 'Healthy'),
        ('WARNING', 'Warning'),
        ('CRITICAL', 'Critical'),
        ('UNKNOWN', 'Unknown'),
    ]

    # Shared bounds for percentage fields. Each field copies this list, so
    # sharing one list object between fields is safe.
    _PERCENT_RANGE = [MinValueValidator(0), MaxValueValidator(100)]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    # CASCADE: deleting the target removes its health-check history.
    target = models.ForeignKey(
        MonitoringTarget,
        on_delete=models.CASCADE,
        related_name='health_checks',
    )

    # Check details
    check_type = models.CharField(max_length=20, choices=CHECK_TYPES)
    status = models.CharField(max_length=20, choices=STATUS_CHOICES)
    response_time_ms = models.PositiveIntegerField(null=True, blank=True)

    # Response details
    status_code = models.PositiveIntegerField(null=True, blank=True)
    response_body = models.TextField(blank=True, null=True)
    error_message = models.TextField(blank=True, null=True)

    # Metrics: optional, but bounded to 0-100 when a value is supplied.
    cpu_usage_percent = models.FloatField(
        null=True,
        blank=True,
        validators=_PERCENT_RANGE,
    )
    memory_usage_percent = models.FloatField(
        null=True,
        blank=True,
        validators=_PERCENT_RANGE,
    )
    disk_usage_percent = models.FloatField(
        null=True,
        blank=True,
        validators=_PERCENT_RANGE,
    )

    # Timestamps
    checked_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        ordering = ['-checked_at']
        indexes = [
            models.Index(fields=['target', 'checked_at']),
            models.Index(fields=['status', 'checked_at']),
            models.Index(fields=['check_type']),
        ]

    def __str__(self):
        """Return ``"<target name> - <status> (<checked_at>)"``."""
        return f"{self.target.name} - {self.status} ({self.checked_at})"
class SystemMetric(models.Model):
    """Definition of a metric: what it is, how it is aggregated, and its limits.

    Individual data points are stored separately and reference this row via
    the ``measurements`` reverse relation. Rows sort by ``name`` by default.
    """

    # Allowed values for ``metric_type``.
    METRIC_TYPES = [
        ('PERFORMANCE', 'Performance Metric'),
        ('BUSINESS', 'Business Metric'),
        ('SECURITY', 'Security Metric'),
        ('INFRASTRUCTURE', 'Infrastructure Metric'),
        ('CUSTOM', 'Custom Metric'),
    ]

    # Allowed values for ``category``.
    METRIC_CATEGORIES = [
        ('API_RESPONSE_TIME', 'API Response Time'),
        ('THROUGHPUT', 'Throughput'),
        ('ERROR_RATE', 'Error Rate'),
        ('AVAILABILITY', 'Availability'),
        ('INCIDENT_COUNT', 'Incident Count'),
        ('MTTR', 'Mean Time to Resolve'),
        ('MTTA', 'Mean Time to Acknowledge'),
        ('SLA_COMPLIANCE', 'SLA Compliance'),
        ('SECURITY_EVENTS', 'Security Events'),
        ('AUTOMATION_SUCCESS', 'Automation Success Rate'),
        ('AI_ACCURACY', 'AI Model Accuracy'),
        ('COST_IMPACT', 'Cost Impact'),
        ('USER_ACTIVITY', 'User Activity'),
        ('SYSTEM_RESOURCES', 'System Resources'),
    ]

    # --- Identity -----------------------------------------------------------
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    name = models.CharField(max_length=200)
    description = models.TextField()
    metric_type = models.CharField(max_length=20, choices=METRIC_TYPES)
    category = models.CharField(max_length=30, choices=METRIC_CATEGORIES)

    # --- Metric configuration -----------------------------------------------
    unit = models.CharField(max_length=50, help_text="Unit of measurement")
    aggregation_method = models.CharField(
        max_length=20,
        choices=[
            ('AVERAGE', 'Average'),
            ('SUM', 'Sum'),
            ('COUNT', 'Count'),
            ('MIN', 'Minimum'),
            ('MAX', 'Maximum'),
            ('PERCENTILE_95', '95th Percentile'),
            ('PERCENTILE_99', '99th Percentile'),
        ],
    )

    # --- Collection configuration -------------------------------------------
    collection_interval_seconds = models.PositiveIntegerField(default=300)  # 5 minutes
    retention_days = models.PositiveIntegerField(default=90)

    # --- Thresholds (both optional) -----------------------------------------
    warning_threshold = models.FloatField(null=True, blank=True)
    critical_threshold = models.FloatField(null=True, blank=True)

    # --- Status -------------------------------------------------------------
    is_active = models.BooleanField(default=True)
    is_system_metric = models.BooleanField(default=False)

    # --- Related module -----------------------------------------------------
    related_module = models.CharField(
        max_length=50,
        blank=True,
        null=True,
        help_text="Related Django module",
    )

    # --- Audit metadata -----------------------------------------------------
    # SET_NULL keeps the metric when the creating user is deleted.
    created_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
    )
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['metric_type', 'category']),
            models.Index(fields=['related_module']),
            models.Index(fields=['is_active']),
        ]

    def __str__(self):
        """Return ``"<name> (<category>)"``."""
        label, group = self.name, self.category
        return f"{label} ({group})"
class MetricMeasurement(models.Model):
    """A single recorded value for a ``SystemMetric``.

    Measurements sort newest first. ``timestamp`` uses ``auto_now_add``, so it
    is filled in automatically when the row is first created.
    """

    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    # CASCADE: deleting the metric definition removes its measurements.
    metric = models.ForeignKey(
        SystemMetric,
        on_delete=models.CASCADE,
        related_name='measurements',
    )

    # Measurement details: up to 15 digits in total, 4 after the decimal point.
    value = models.DecimalField(max_digits=15, decimal_places=4)
    timestamp = models.DateTimeField(auto_now_add=True)

    # Context: free-form JSON objects, each defaulting to an empty dict.
    tags = models.JSONField(
        default=dict,
        help_text="Additional tags for this measurement",
    )
    metadata = models.JSONField(
        default=dict,
        help_text="Additional metadata",
    )

    class Meta:
        ordering = ['-timestamp']
        indexes = [
            models.Index(fields=['metric', 'timestamp']),
            models.Index(fields=['timestamp']),
        ]

    def __str__(self):
        """Return ``"<metric name>: <value> (<timestamp>)"``."""
        metric_name = self.metric.name
        return f"{metric_name}: {self.value} ({self.timestamp})"
class AlertRule(models.Model):
    """A configured rule describing when an alert should be raised.

    A rule may reference a ``SystemMetric``, a ``MonitoringTarget``, both, or
    neither (both relations are nullable). Rows sort by ``name`` by default.
    """

    # Allowed values for ``alert_type``.
    ALERT_TYPES = [
        ('THRESHOLD', 'Threshold Alert'),
        ('ANOMALY', 'Anomaly Alert'),
        ('PATTERN', 'Pattern Alert'),
        ('AVAILABILITY', 'Availability Alert'),
        ('PERFORMANCE', 'Performance Alert'),
    ]

    # Allowed values for ``severity``; the ``Alert`` model uses this list too.
    SEVERITY_CHOICES = [
        ('LOW', 'Low'),
        ('MEDIUM', 'Medium'),
        ('HIGH', 'High'),
        ('CRITICAL', 'Critical'),
    ]

    # Allowed values for ``status``.
    STATUS_CHOICES = [
        ('ACTIVE', 'Active'),
        ('INACTIVE', 'Inactive'),
        ('MAINTENANCE', 'Maintenance'),
    ]

    # --- Identity -----------------------------------------------------------
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    name = models.CharField(max_length=200)
    description = models.TextField()
    alert_type = models.CharField(max_length=20, choices=ALERT_TYPES)
    severity = models.CharField(max_length=20, choices=SEVERITY_CHOICES)

    # --- Rule configuration -------------------------------------------------
    # No default is declared, so a value must be supplied for every rule.
    condition = models.JSONField(
        help_text="Alert condition configuration",
    )
    evaluation_interval_seconds = models.PositiveIntegerField(default=60)

    # --- Related objects ----------------------------------------------------
    # CASCADE on both: deleting the metric or target deletes the rule.
    metric = models.ForeignKey(
        SystemMetric,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name='alert_rules',
    )
    target = models.ForeignKey(
        MonitoringTarget,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        related_name='alert_rules',
    )

    # --- Notification configuration -----------------------------------------
    notification_channels = models.JSONField(
        default=list,
        help_text="List of notification channels (email, slack, webhook, etc.)",
    )
    notification_template = models.TextField(
        blank=True,
        null=True,
        help_text="Custom notification template",
    )

    # --- Status -------------------------------------------------------------
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='ACTIVE',
    )
    is_enabled = models.BooleanField(default=True)

    # --- Audit metadata -----------------------------------------------------
    # SET_NULL keeps the rule when the creating user is deleted.
    created_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
    )
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['alert_type', 'severity']),
            models.Index(fields=['status', 'is_enabled']),
        ]

    def __str__(self):
        """Return ``"<name> (<severity>)"``."""
        label, level = self.name, self.severity
        return f"{label} ({level})"
class Alert(models.Model):
    """A concrete alert produced under an ``AlertRule``.

    Tracks the alert's lifecycle status, the values recorded when it fired,
    and who acknowledged or resolved it. Alerts sort newest first.
    """

    # Allowed values for ``status``; new alerts default to 'TRIGGERED'.
    STATUS_CHOICES = [
        ('TRIGGERED', 'Triggered'),
        ('ACKNOWLEDGED', 'Acknowledged'),
        ('RESOLVED', 'Resolved'),
        ('SUPPRESSED', 'Suppressed'),
    ]

    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    # CASCADE: deleting the rule removes the alerts raised under it.
    rule = models.ForeignKey(
        AlertRule,
        on_delete=models.CASCADE,
        related_name='alerts',
    )

    # --- Alert details ------------------------------------------------------
    title = models.CharField(max_length=200)
    description = models.TextField()
    # Severity uses the same choice list as the rule model.
    severity = models.CharField(
        max_length=20,
        choices=AlertRule.SEVERITY_CHOICES,
    )
    status = models.CharField(
        max_length=20,
        choices=STATUS_CHOICES,
        default='TRIGGERED',
    )

    # --- Context ------------------------------------------------------------
    triggered_value = models.DecimalField(
        max_digits=15,
        decimal_places=4,
        null=True,
        blank=True,
    )
    threshold_value = models.DecimalField(
        max_digits=15,
        decimal_places=4,
        null=True,
        blank=True,
    )
    context_data = models.JSONField(
        default=dict,
        help_text="Additional context data for the alert",
    )

    # --- Timestamps ---------------------------------------------------------
    triggered_at = models.DateTimeField(auto_now_add=True)
    acknowledged_at = models.DateTimeField(null=True, blank=True)
    resolved_at = models.DateTimeField(null=True, blank=True)

    # --- Assignment ---------------------------------------------------------
    # SET_NULL keeps the alert when the referenced user is deleted.
    acknowledged_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='acknowledged_alerts',
    )
    resolved_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_alerts',
    )

    class Meta:
        ordering = ['-triggered_at']
        indexes = [
            models.Index(fields=['rule', 'status']),
            models.Index(fields=['severity', 'status']),
            models.Index(fields=['triggered_at']),
        ]

    def __str__(self):
        """Return ``"<title> (<severity>) - <status>"``."""
        heading = f"{self.title} ({self.severity})"
        return f"{heading} - {self.status}"
class MonitoringDashboard(models.Model):
    """Stored configuration for a monitoring dashboard.

    Holds layout and widget configuration as JSON, access-control settings
    (public flag, explicit users, role names), and auto-refresh settings.
    Rows sort by ``name`` by default.
    """

    # Allowed values for ``dashboard_type``.
    DASHBOARD_TYPES = [
        ('SYSTEM_OVERVIEW', 'System Overview'),
        ('PERFORMANCE', 'Performance'),
        ('BUSINESS_METRICS', 'Business Metrics'),
        ('SECURITY', 'Security'),
        ('INFRASTRUCTURE', 'Infrastructure'),
        ('CUSTOM', 'Custom'),
    ]

    # --- Identity -----------------------------------------------------------
    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    name = models.CharField(max_length=200)
    description = models.TextField()
    dashboard_type = models.CharField(max_length=20, choices=DASHBOARD_TYPES)

    # --- Dashboard configuration --------------------------------------------
    layout_config = models.JSONField(
        default=dict,
        help_text="Dashboard layout configuration",
    )
    widget_configs = models.JSONField(
        default=list,
        help_text="Configuration for dashboard widgets",
    )

    # --- Access control -----------------------------------------------------
    is_public = models.BooleanField(default=False)
    allowed_users = models.ManyToManyField(
        User,
        blank=True,
        related_name='accessible_monitoring_dashboards',
    )
    allowed_roles = models.JSONField(
        default=list,
        help_text="List of roles that can access this dashboard",
    )

    # --- Refresh configuration ----------------------------------------------
    auto_refresh_enabled = models.BooleanField(default=True)
    refresh_interval_seconds = models.PositiveIntegerField(default=30)

    # --- Status and audit metadata ------------------------------------------
    is_active = models.BooleanField(default=True)
    # SET_NULL keeps the dashboard when the creating user is deleted.
    created_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
    )
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['dashboard_type', 'is_active']),
            models.Index(fields=['is_public']),
        ]

    def __str__(self):
        """Return ``"<name> (<dashboard_type>)"``."""
        label, kind = self.name, self.dashboard_type
        return f"{label} ({kind})"
class SystemStatus(models.Model):
    """A record of the overall system status.

    Each row has a status value, a user-facing message, the affected services,
    and start/resolution timestamps. Rows sort by ``started_at``, newest first.
    """

    # Allowed values for ``status``.
    STATUS_CHOICES = [
        ('OPERATIONAL', 'Operational'),
        ('DEGRADED', 'Degraded'),
        ('PARTIAL_OUTAGE', 'Partial Outage'),
        ('MAJOR_OUTAGE', 'Major Outage'),
        ('MAINTENANCE', 'Maintenance'),
    ]

    id = models.UUIDField(
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    status = models.CharField(max_length=20, choices=STATUS_CHOICES)
    message = models.TextField(help_text="Status message for users")

    # --- Impact details -----------------------------------------------------
    affected_services = models.JSONField(
        default=list,
        help_text="List of affected services",
    )
    estimated_resolution = models.DateTimeField(null=True, blank=True)

    # --- Timestamps ---------------------------------------------------------
    started_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    # Stays NULL until a resolution time is stored; see ``is_resolved``.
    resolved_at = models.DateTimeField(null=True, blank=True)

    # --- Audit metadata -----------------------------------------------------
    # SET_NULL keeps the status entry when the creating user is deleted.
    created_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
    )

    class Meta:
        ordering = ['-started_at']
        indexes = [
            models.Index(fields=['status', 'started_at']),
            models.Index(fields=['started_at']),
        ]

    def __str__(self):
        """Return ``"System Status: <status> (<started_at>)"``."""
        state, began = self.status, self.started_at
        return f"System Status: {state} ({began})"

    @property
    def is_resolved(self):
        """Return True when ``resolved_at`` holds a value, False otherwise."""
        resolved_on = self.resolved_at
        return resolved_on is not None