Iliyan Angelov
2025-09-19 11:58:53 +03:00
parent 306b20e24a
commit 6b247e5b9f
11423 changed files with 1500615 additions and 778 deletions

ETB-API/monitoring/tasks.py Normal file

@@ -0,0 +1,319 @@
"""
Celery tasks for automated monitoring
"""

import logging
from datetime import timedelta

from celery import shared_task
from django.utils import timezone

from monitoring.services.alerting import AlertingService
from monitoring.services.health_checks import HealthCheckService
from monitoring.services.metrics_collector import MetricsCollector

logger = logging.getLogger(__name__)

@shared_task(bind=True, max_retries=3)
def execute_health_checks(self):
"""Execute health checks for all monitoring targets"""
try:
logger.info("Starting health check execution")
health_service = HealthCheckService()
results = health_service.execute_all_health_checks()
logger.info(f"Health checks completed. Results: {len(results)} targets checked")
return {
'status': 'success',
'targets_checked': len(results),
'results': results
}
except Exception as e:
logger.error(f"Health check execution failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying health checks in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}
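
# Note: the manual retry/backoff pattern used in these tasks could also be
# expressed with Celery's built-in retry options. The commented-out decorator
# below is only a minimal alternative sketch (it assumes Celery 4.2+, where
# retry_backoff is available), not what this module does:
#
# @shared_task(bind=True, autoretry_for=(Exception,), retry_backoff=True,
#              retry_kwargs={'max_retries': 3})
# def execute_health_checks(self):
#     ...
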
@shared_task(bind=True, max_retries=3)
def collect_metrics(self):
"""Collect metrics from all configured sources"""
try:
logger.info("Starting metrics collection")
collector = MetricsCollector()
results = collector.collect_all_metrics()
successful_metrics = len([r for r in results.values() if 'error' not in r])
failed_metrics = len([r for r in results.values() if 'error' in r])
logger.info(f"Metrics collection completed. Success: {successful_metrics}, Failed: {failed_metrics}")
return {
'status': 'success',
'successful_metrics': successful_metrics,
'failed_metrics': failed_metrics,
'results': results
}
except Exception as e:
logger.error(f"Metrics collection failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying metrics collection in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}

@shared_task(bind=True, max_retries=3)
def evaluate_alerts(self):
"""Evaluate alert rules and send notifications"""
try:
logger.info("Starting alert evaluation")
alerting_service = AlertingService()
results = alerting_service.run_alert_evaluation()
logger.info(f"Alert evaluation completed. Triggered: {results['triggered_alerts']}, Notifications: {results['notifications_sent']}")
return results
except Exception as e:
logger.error(f"Alert evaluation failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying alert evaluation in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}

@shared_task(bind=True, max_retries=3)
def cleanup_old_data(self):
"""Clean up old monitoring data"""
try:
logger.info("Starting data cleanup")
from monitoring.models import HealthCheck, MetricMeasurement, Alert
# Clean up old health checks (keep last 7 days)
cutoff_date = timezone.now() - timedelta(days=7)
old_health_checks = HealthCheck.objects.filter(checked_at__lt=cutoff_date)
health_checks_deleted = old_health_checks.count()
old_health_checks.delete()
# Clean up old metric measurements (keep last 90 days)
cutoff_date = timezone.now() - timedelta(days=90)
old_measurements = MetricMeasurement.objects.filter(timestamp__lt=cutoff_date)
measurements_deleted = old_measurements.count()
old_measurements.delete()
# Clean up resolved alerts older than 30 days
cutoff_date = timezone.now() - timedelta(days=30)
old_alerts = Alert.objects.filter(
status='RESOLVED',
resolved_at__lt=cutoff_date
)
alerts_deleted = old_alerts.count()
old_alerts.delete()
logger.info(f"Data cleanup completed. Health checks: {health_checks_deleted}, Measurements: {measurements_deleted}, Alerts: {alerts_deleted}")
return {
'status': 'success',
'health_checks_deleted': health_checks_deleted,
'measurements_deleted': measurements_deleted,
'alerts_deleted': alerts_deleted
}
except Exception as e:
logger.error(f"Data cleanup failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying data cleanup in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}
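
# Aside: Django's QuerySet.delete() already returns a tuple of
# (rows_deleted, per_model_counts), where rows_deleted includes any cascaded
# related rows, so the separate .count() calls above could be dropped to save
# one query per model. A minimal sketch of that variant (not what this task does):
#
# health_checks_deleted, _ = HealthCheck.objects.filter(checked_at__lt=cutoff_date).delete()
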
@shared_task(bind=True, max_retries=3)
def generate_system_status_report(self):
"""Generate system status report"""
try:
logger.info("Generating system status report")
from monitoring.models import SystemStatus
health_service = HealthCheckService()
health_summary = health_service.get_system_health_summary()
# Determine overall system status
if health_summary['critical_targets'] > 0:
status = 'MAJOR_OUTAGE'
message = f"Critical issues detected in {health_summary['critical_targets']} systems"
elif health_summary['warning_targets'] > 0:
status = 'DEGRADED'
message = f"Performance issues detected in {health_summary['warning_targets']} systems"
else:
status = 'OPERATIONAL'
message = "All systems operational"
# Create system status record
system_status = SystemStatus.objects.create(
status=status,
message=message,
affected_services=[] # Would be populated based on actual issues
)
logger.info(f"System status report generated: {status}")
return {
'status': 'success',
'system_status': status,
'message': message,
'health_summary': health_summary
}
except Exception as e:
logger.error(f"System status report generation failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying system status report in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}
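
# The branching above relies on get_system_health_summary() exposing at least
# 'critical_targets' and 'warning_targets' counts. A summary shaped roughly like
# the sketch below would satisfy it (any keys beyond those two are illustrative
# assumptions, not part of this commit):
#
# {'critical_targets': 0, 'warning_targets': 1, ...}
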
@shared_task(bind=True, max_retries=3)
def monitor_external_integrations(self):
"""Monitor external integrations and services"""
try:
logger.info("Starting external integrations monitoring")
from monitoring.models import MonitoringTarget
health_service = HealthCheckService()
# Get external integration targets
external_targets = MonitoringTarget.objects.filter(
target_type='EXTERNAL_API',
status='ACTIVE'
)
results = {}
for target in external_targets:
try:
result = health_service.execute_health_check(target, 'HTTP')
results[target.name] = result
# Log integration status
if result['status'] == 'CRITICAL':
logger.warning(f"External integration {target.name} is critical")
elif result['status'] == 'WARNING':
logger.info(f"External integration {target.name} has warnings")
except Exception as e:
logger.error(f"Failed to check external integration {target.name}: {e}")
results[target.name] = {'status': 'CRITICAL', 'error': str(e)}
logger.info(f"External integrations monitoring completed. Checked: {len(results)} integrations")
return {
'status': 'success',
'integrations_checked': len(results),
'results': results
}
except Exception as e:
logger.error(f"External integrations monitoring failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying external integrations monitoring in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}
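
# The tasks in this module can also be queued ad hoc with Celery's standard
# calling API. A usage sketch (the countdown value here is an arbitrary example):
#
# execute_health_checks.delay()
# monitor_external_integrations.apply_async(countdown=60)
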
@shared_task(bind=True, max_retries=3)
def update_monitoring_dashboards(self):
"""Update monitoring dashboards with latest data"""
try:
logger.info("Updating monitoring dashboards")
from monitoring.models import MonitoringDashboard
from monitoring.services.metrics_collector import MetricsAggregator
aggregator = MetricsAggregator()
# Get active dashboards
active_dashboards = MonitoringDashboard.objects.filter(is_active=True)
updated_dashboards = 0
for dashboard in active_dashboards:
try:
# Update dashboard data (this would typically involve caching or real-time updates)
# For now, just log the update
logger.info(f"Updating dashboard: {dashboard.name}")
updated_dashboards += 1
except Exception as e:
logger.error(f"Failed to update dashboard {dashboard.name}: {e}")
logger.info(f"Dashboard updates completed. Updated: {updated_dashboards} dashboards")
return {
'status': 'success',
'dashboards_updated': updated_dashboards
}
except Exception as e:
logger.error(f"Dashboard update failed: {e}")
# Retry with exponential backoff
if self.request.retries < self.max_retries:
countdown = 2 ** self.request.retries
logger.info(f"Retrying dashboard update in {countdown} seconds")
raise self.retry(countdown=countdown)
return {
'status': 'error',
'error': str(e)
}
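

# A periodic schedule for these tasks would typically live in the project's
# Celery/Django settings rather than in this module. The commented-out block
# below is only a sketch of what such a beat schedule entry could look like;
# the setting location, the task module path (monitoring.tasks), and the
# interval values are assumptions, not part of this commit.
#
# from celery.schedules import crontab
#
# CELERY_BEAT_SCHEDULE = {
#     'execute-health-checks': {
#         'task': 'monitoring.tasks.execute_health_checks',
#         'schedule': 60.0,  # every minute
#     },
#     'cleanup-old-monitoring-data': {
#         'task': 'monitoring.tasks.cleanup_old_data',
#         'schedule': crontab(hour=3, minute=0),  # daily at 03:00
#     },
# }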