604 lines
22 KiB
Python
604 lines
22 KiB
Python
"""
|
|
Comprehensive Health Check System for ETB-API
|
|
Enterprise-grade health monitoring with detailed diagnostics
|
|
"""
|
|
import logging
import time
import uuid
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional

import psutil
import redis
import requests
from django.conf import settings
from django.core.cache import cache
from django.core.exceptions import ImproperlyConfigured
from django.db import connection
from django.http import JsonResponse
from django.utils import timezone
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HealthCheckService:
    """Run named infrastructure health checks and aggregate their results.

    Every check is a zero-argument method that returns a dict containing at
    least ``status`` (``'healthy'``, ``'warning'`` or ``'unhealthy'``) and
    ``timestamp``. Checks trap their own exceptions and report them as
    ``'unhealthy'`` instead of raising.
    """

    # Prefix Django puts on the development SECRET_KEY it generates.
    _INSECURE_KEY_PREFIX = 'django-insecure-'

    def __init__(self):
        """Register the available checks, keyed by their public name."""
        self.checks = {
            'database': self._check_database,
            'cache': self._check_cache,
            'celery': self._check_celery,
            'redis': self._check_redis,
            'disk_space': self._check_disk_space,
            'memory': self._check_memory,
            'cpu': self._check_cpu,
            'external_services': self._check_external_services,
            'modules': self._check_modules,
            'security': self._check_security,
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _unhealthy(prefix: str, exc: Exception) -> Dict[str, Any]:
        """Build the failure payload used when a check raises."""
        return {
            'status': 'unhealthy',
            'message': f'{prefix}: {str(exc)}',
            'error': str(exc),
            'timestamp': timezone.now().isoformat(),
        }

    @staticmethod
    def _threshold_status(percent: float, warning: float = 80, critical: float = 90) -> str:
        """Map a usage percentage to a status; both bounds are exclusive."""
        if percent > critical:
            return 'unhealthy'
        if percent > warning:
            return 'warning'
        return 'healthy'

    @staticmethod
    def _as_text(value):
        """Return a Redis reply as ``str``; ``None`` and ``str`` pass through."""
        if isinstance(value, (bytes, bytearray)):
            return value.decode()
        return value

    @classmethod
    def _is_secret_key_secure(cls, secret_key) -> bool:
        """Return True when the key is set and is not a generated dev key."""
        if not secret_key:
            return False
        if isinstance(secret_key, bytes):
            secret_key = secret_key.decode('utf-8', errors='replace')
        return not str(secret_key).startswith(cls._INSECURE_KEY_PREFIX)

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------

    def perform_health_check(self, checks: Optional[List[str]] = None) -> Dict[str, Any]:
        """Run the requested checks and return an aggregated report.

        Args:
            checks: Names of the checks to run. ``None`` or an empty list
                runs every registered check. Duplicates are run once; names
                that are not registered are skipped and listed under
                ``unknown_checks`` in the report.

        Returns:
            A dict with the overall ``status`` (``'healthy'``, ``'degraded'``
            when any check warned, ``'unhealthy'`` when any check failed),
            the per-check results under ``checks``, the counters under
            ``summary`` and timing/process figures under ``performance``.
        """
        started = time.perf_counter()
        summary = {
            'total_checks': 0,
            'passed_checks': 0,
            'failed_checks': 0,
            'warning_checks': 0,
        }
        results = {
            'status': 'healthy',
            'timestamp': timezone.now().isoformat(),
            'version': getattr(settings, 'VERSION', '1.0.0'),
            'environment': 'production' if not settings.DEBUG else 'development',
            'checks': {},
            'summary': summary,
            'performance': {
                'response_time_ms': 0,
                'memory_usage_mb': 0,
                'cpu_usage_percent': 0,
            },
            'unknown_checks': [],
        }

        # dict.fromkeys de-duplicates while keeping the caller's order, so a
        # repeated name is neither run nor counted twice.
        requested = list(dict.fromkeys(checks or self.checks))
        unknown = [name for name in requested if name not in self.checks]
        if unknown:
            # %r keeps request-supplied names from injecting log lines.
            logger.warning("Ignoring unknown health checks: %r", unknown)
            results['unknown_checks'] = unknown

        for check_name in requested:
            runner = self.checks.get(check_name)
            if runner is None:
                continue

            try:
                outcome = runner()
                # Read the status before recording anything so a malformed
                # result is counted exactly once, as a failure.
                status = outcome['status']
            except Exception as exc:
                logger.exception("Health check %s failed: %s", check_name, exc)
                outcome = self._unhealthy('Check failed', exc)
                status = 'unhealthy'

            results['checks'][check_name] = outcome
            summary['total_checks'] += 1
            if status == 'healthy':
                summary['passed_checks'] += 1
            elif status == 'warning':
                summary['warning_checks'] += 1
            else:
                summary['failed_checks'] += 1

        performance = results['performance']
        performance['response_time_ms'] = round((time.perf_counter() - started) * 1000, 2)
        performance['memory_usage_mb'] = round(psutil.Process().memory_info().rss / 1024 / 1024, 2)
        performance['cpu_usage_percent'] = round(psutil.cpu_percent(), 2)

        if summary['failed_checks'] > 0:
            results['status'] = 'unhealthy'
        elif summary['warning_checks'] > 0:
            results['status'] = 'degraded'

        return results

    # ------------------------------------------------------------------
    # Individual checks
    # ------------------------------------------------------------------

    def _check_database(self) -> Dict[str, Any]:
        """Check database connectivity and timing on the default connection.

        Runs ``SELECT 1`` and counts rows in ``django_migrations``. On
        PostgreSQL it also counts active queries older than 30 seconds; on
        other backends ``long_running_queries`` is ``None`` because
        ``pg_stat_activity`` does not exist there.
        """
        try:
            started = time.perf_counter()
            long_queries = None

            with connection.cursor() as cursor:
                cursor.execute("SELECT 1")
                cursor.fetchone()

                cursor.execute("SELECT COUNT(*) FROM django_migrations")
                migration_count = cursor.fetchone()[0]

                if connection.vendor == 'postgresql':
                    cursor.execute("""
                        SELECT COUNT(*) FROM pg_stat_activity
                        WHERE state = 'active' AND query_start < NOW() - INTERVAL '30 seconds'
                    """)
                    long_queries = cursor.fetchone()[0]

            response_time = (time.perf_counter() - started) * 1000

            status = 'healthy'
            message = 'Database is healthy'
            if response_time > 1000:  # 1 second
                status = 'warning'
                message = 'Database response time is slow'
            elif long_queries is not None and long_queries > 5:
                status = 'warning'
                message = 'Multiple long-running queries detected'

            return {
                'status': status,
                'message': message,
                'response_time_ms': round(response_time, 2),
                'migration_count': migration_count,
                'long_running_queries': long_queries,
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Database connection failed', e)

    def _check_cache(self) -> Dict[str, Any]:
        """Check the Django cache with a timed write/read round trip."""
        try:
            started = time.perf_counter()

            # A per-call key stops concurrent probes from overwriting each
            # other's value and reporting a false failure.
            test_key = f'health_check_test_{uuid.uuid4().hex}'
            test_value = f'test_{time.time()}'

            cache.set(test_key, test_value, 30)
            retrieved_value = cache.get(test_key)
            response_time = (time.perf_counter() - started) * 1000
            cache.delete(test_key)

            if retrieved_value != test_value:
                return {
                    'status': 'unhealthy',
                    'message': 'Cache read/write test failed',
                    'response_time_ms': round(response_time, 2),
                    'timestamp': timezone.now().isoformat(),
                }

            status = 'healthy'
            message = 'Cache is healthy'
            if response_time > 100:  # 100ms
                status = 'warning'
                message = 'Cache response time is slow'

            return {
                'status': status,
                'message': message,
                'response_time_ms': round(response_time, 2),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Cache connection failed', e)

    def _check_celery(self) -> Dict[str, Any]:
        """Check Celery workers through the current app's inspect API.

        Reports ``unhealthy`` when no worker replies, and ``warning`` above
        100 active or 50 scheduled tasks. ``registered_tasks`` is the task
        count of the first worker that replied.
        """
        try:
            from celery import current_app

            inspector = current_app.control.inspect()
            # Each call may return None when no worker replied.
            active_workers = inspector.active() or {}
            scheduled_tasks = inspector.scheduled() or {}
            registered_tasks = inspector.registered() or {}

            worker_count = len(active_workers)
            total_active_tasks = sum(len(tasks) for tasks in active_workers.values())
            total_scheduled_tasks = sum(len(tasks) for tasks in scheduled_tasks.values())
            first_worker_tasks = next(iter(registered_tasks.values()), None) or []

            status = 'healthy'
            message = 'Celery workers are healthy'
            if worker_count == 0:
                status = 'unhealthy'
                message = 'No active Celery workers found'
            elif total_active_tasks > 100:
                status = 'warning'
                message = 'High number of active tasks detected'
            elif total_scheduled_tasks > 50:
                status = 'warning'
                message = 'High number of scheduled tasks detected'

            return {
                'status': status,
                'message': message,
                'worker_count': worker_count,
                'active_tasks': total_active_tasks,
                'scheduled_tasks': total_scheduled_tasks,
                'registered_tasks': len(first_worker_tasks),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Celery check failed', e)

    def _check_redis(self) -> Dict[str, Any]:
        """Check Redis at ``CELERY_BROKER_URL`` with a write/read round trip.

        Falls back to ``redis://localhost:6379/0`` when the setting is
        absent. The client created for the probe is disconnected afterwards.
        """
        try:
            started = time.perf_counter()

            redis_url = getattr(settings, 'CELERY_BROKER_URL', 'redis://localhost:6379/0')
            client = redis.from_url(redis_url)
            try:
                # Per-call key: see _check_cache for the race this avoids.
                test_key = f'health_check_redis_{uuid.uuid4().hex}'
                test_value = f'test_{time.time()}'

                client.set(test_key, test_value, ex=30)
                # The reply is bytes by default, str with decode_responses,
                # and None when the key is missing.
                retrieved_value = self._as_text(client.get(test_key))
                client.delete(test_key)

                info = client.info()
                response_time = (time.perf_counter() - started) * 1000
            finally:
                # Release the sockets of this throwaway client's pool.
                client.connection_pool.disconnect()

            if retrieved_value != test_value:
                return {
                    'status': 'unhealthy',
                    'message': 'Redis read/write test failed',
                    'response_time_ms': round(response_time, 2),
                    'timestamp': timezone.now().isoformat(),
                }

            status = 'healthy'
            message = 'Redis is healthy'
            if response_time > 50:  # 50ms
                status = 'warning'
                message = 'Redis response time is slow'

            return {
                'status': status,
                'message': message,
                'response_time_ms': round(response_time, 2),
                'redis_version': info.get('redis_version'),
                'used_memory_human': info.get('used_memory_human'),
                'connected_clients': info.get('connected_clients'),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Redis connection failed', e)

    def _check_disk_space(self) -> Dict[str, Any]:
        """Check usage of the filesystem mounted at ``/``."""
        try:
            disk_usage = psutil.disk_usage('/')
            total_gb = disk_usage.total / (1024**3)
            used_gb = disk_usage.used / (1024**3)
            free_gb = disk_usage.free / (1024**3)
            usage_percent = (used_gb / total_gb) * 100

            status = self._threshold_status(usage_percent)
            message = {
                'healthy': 'Disk space is healthy',
                'warning': 'Disk space usage is high',
                'unhealthy': 'Disk space critically low',
            }[status]

            return {
                'status': status,
                'message': message,
                'total_gb': round(total_gb, 2),
                'used_gb': round(used_gb, 2),
                'free_gb': round(free_gb, 2),
                'usage_percent': round(usage_percent, 2),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Disk space check failed', e)

    def _check_memory(self) -> Dict[str, Any]:
        """Check system virtual memory usage as reported by psutil."""
        try:
            memory = psutil.virtual_memory()
            total_gb = memory.total / (1024**3)
            used_gb = memory.used / (1024**3)
            available_gb = memory.available / (1024**3)
            usage_percent = memory.percent

            status = self._threshold_status(usage_percent)
            message = {
                'healthy': 'Memory usage is healthy',
                'warning': 'Memory usage is high',
                'unhealthy': 'Memory usage critically high',
            }[status]

            return {
                'status': status,
                'message': message,
                'total_gb': round(total_gb, 2),
                'used_gb': round(used_gb, 2),
                'available_gb': round(available_gb, 2),
                'usage_percent': round(usage_percent, 2),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Memory check failed', e)

    def _check_cpu(self) -> Dict[str, Any]:
        """Check CPU utilisation, sampled over a blocking one-second window."""
        try:
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_count = psutil.cpu_count()
            # Zeros are reported when this psutil build lacks getloadavg.
            load_avg = psutil.getloadavg() if hasattr(psutil, 'getloadavg') else [0, 0, 0]

            status = self._threshold_status(cpu_percent)
            message = {
                'healthy': 'CPU usage is healthy',
                'warning': 'CPU usage is high',
                'unhealthy': 'CPU usage critically high',
            }[status]

            return {
                'status': status,
                'message': message,
                'cpu_percent': round(cpu_percent, 2),
                'cpu_count': cpu_count,
                'load_avg_1min': round(load_avg[0], 2),
                'load_avg_5min': round(load_avg[1], 2),
                'load_avg_15min': round(load_avg[2], 2),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('CPU check failed', e)

    def _check_external_services(self) -> Dict[str, Any]:
        """Report on external service dependencies.

        NOTE: this is a placeholder. No connection is attempted, so every
        listed service is reported as healthy with a zero response time.
        """
        try:
            # Targets a real probe would use; currently only the names are read.
            services = {
                'email': {
                    'url': f'smtp://{getattr(settings, "EMAIL_HOST", "localhost")}:{getattr(settings, "EMAIL_PORT", "587")}',
                    'timeout': 5,
                },
            }

            results = {
                service_name: {
                    'status': 'healthy',
                    'message': f'{service_name} service is accessible',
                    'response_time_ms': 0,
                }
                for service_name in services
            }

            return {
                'status': 'healthy',
                'message': 'External services check completed',
                'services': results,
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('External services check failed', e)

    def _check_modules(self) -> Dict[str, Any]:
        """Report model and applied-migration counts for each installed app.

        An app whose inspection raises is recorded with status ``warning``;
        any such app makes the overall result ``warning`` as well.
        """
        try:
            from django.apps import apps

            installed_apps = []
            module_status = {}

            for app_config in apps.get_app_configs():
                app_name = app_config.name
                installed_apps.append(app_name)

                try:
                    # get_models() returns a generator, so it has no len().
                    model_count = len(list(app_config.get_models()))

                    # django_migrations stores the app label, which an
                    # AppConfig may set independently of its dotted name.
                    with connection.cursor() as cursor:
                        cursor.execute(
                            "SELECT COUNT(*) FROM django_migrations WHERE app = %s",
                            [app_config.label],
                        )
                        migration_count = cursor.fetchone()[0]

                    module_status[app_name] = {
                        'status': 'healthy',
                        'model_count': model_count,
                        'migration_count': migration_count,
                    }
                except Exception as e:
                    module_status[app_name] = {
                        'status': 'warning',
                        'error': str(e),
                    }

            troubled = [
                name for name, details in module_status.items()
                if details['status'] != 'healthy'
            ]
            status = 'healthy'
            message = 'All modules are healthy'
            if troubled:
                status = 'warning'
                message = f'Module issues detected: {", ".join(troubled)}'

            return {
                'status': status,
                'message': message,
                'installed_apps': installed_apps,
                'module_status': module_status,
                'total_apps': len(installed_apps),
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Module check failed', e)

    def _check_security(self) -> Dict[str, Any]:
        """Check security-related settings; any failed item yields ``warning``."""
        try:
            security_checks = {
                'debug_mode': not settings.DEBUG,
                'secret_key_set': self._is_secret_key_secure(settings.SECRET_KEY),
                'https_enabled': bool(getattr(settings, 'SECURE_SSL_REDIRECT', False)),
                'hsts_enabled': (getattr(settings, 'SECURE_HSTS_SECONDS', 0) or 0) > 0,
                # NOTE: hard-coded; the middleware configuration is not inspected.
                'csrf_protection': True,  # Django default
                'session_secure': bool(getattr(settings, 'SESSION_COOKIE_SECURE', False)),
            }

            failed_checks = [check for check, passed in security_checks.items() if not passed]

            status = 'healthy'
            message = 'Security configuration is healthy'
            if failed_checks:
                status = 'warning'
                message = f'Security issues detected: {", ".join(failed_checks)}'

            return {
                'status': status,
                'message': message,
                'security_checks': security_checks,
                'failed_checks': failed_checks,
                'timestamp': timezone.now().isoformat(),
            }
        except Exception as e:
            return self._unhealthy('Security check failed', e)
|
|
|
|
|
|
def health_check_view(request):
    """Serve the full health report as JSON.

    The optional, repeatable ``checks`` query parameter selects which checks
    run; without it every check runs. Responds with 200 when the overall
    status is ``healthy`` or ``degraded`` and 503 otherwise, including when
    the view itself raises.
    """
    try:
        service = HealthCheckService()
        requested = request.GET.getlist('checks')
        report = service.perform_health_check(requested or None)

        # A degraded service is still operational, so it keeps answering 200.
        operational = report['status'] in ('healthy', 'degraded')
        return JsonResponse(report, status=200 if operational else 503)

    except Exception as e:
        logger.error(f"Health check view failed: {str(e)}")
        failure = {
            'status': 'unhealthy',
            'message': f'Health check failed: {str(e)}',
            'error': str(e),
            'timestamp': timezone.now().isoformat(),
        }
        return JsonResponse(failure, status=503)
|
|
|
|
|
|
def readiness_check_view(request):
    """Serve a quick readiness probe covering the database and the cache.

    Runs ``SELECT 1`` on the default database connection and performs a
    cache write/read round trip. Responds with 200 and ``status: ready``
    when both succeed, otherwise 503 with ``status: not_ready`` and the
    error text in ``message``.
    """
    try:
        with connection.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()

        cache.set('readiness_check', 'ok', 10)
        # Compare the value read back: a cache backend that silently drops
        # writes would otherwise still be reported as ready.
        if cache.get('readiness_check') != 'ok':
            raise RuntimeError('Cache read/write test failed')

        return JsonResponse({
            'status': 'ready',
            'timestamp': timezone.now().isoformat(),
        })

    except Exception as e:
        logger.error("Readiness check failed: %s", e)
        return JsonResponse({
            'status': 'not_ready',
            'message': str(e),
            'timestamp': timezone.now().isoformat(),
        }, status=503)
|
|
|
|
|
|
def liveness_check_view(request):
    """Serve a liveness probe: always 200 with ``status: alive`` and a timestamp."""
    payload = {
        'status': 'alive',
        'timestamp': timezone.now().isoformat(),
    }
    return JsonResponse(payload)
|