This commit is contained in:
Iliyan Angelov
2025-09-19 11:58:53 +03:00
parent 306b20e24a
commit 6b247e5b9f
11423 changed files with 1500615 additions and 778 deletions

View File

@@ -0,0 +1,958 @@
# ETB-API Enterprise Deployment Guide
## 🚀 **Enterprise-Grade Deployment for Production**
This comprehensive guide provides step-by-step instructions for deploying the ETB-API platform in an enterprise environment with high availability, security, and scalability.
## 📋 **Table of Contents**
1. [Prerequisites](#prerequisites)
2. [Infrastructure Setup](#infrastructure-setup)
3. [Database Configuration](#database-configuration)
4. [Application Deployment](#application-deployment)
5. [Security Configuration](#security-configuration)
6. [Monitoring & Observability](#monitoring--observability)
7. [Backup & Recovery](#backup--recovery)
8. [High Availability](#high-availability)
9. [Performance Optimization](#performance-optimization)
10. [Maintenance & Operations](#maintenance--operations)
## 🔧 **Prerequisites**
### System Requirements
- **Operating System**: Ubuntu 20.04 LTS or CentOS 8+
- **CPU**: 8+ cores (16+ recommended for production)
- **RAM**: 32GB+ (64GB+ recommended for production)
- **Storage**: 500GB+ SSD (1TB+ recommended for production)
- **Network**: 1Gbps+ bandwidth
### Software Requirements
- **Python**: 3.9+
- **PostgreSQL**: 13+
- **Redis**: 6.2+
- **Nginx**: 1.18+
- **Docker**: 20.10+ (optional)
- **Kubernetes**: 1.21+ (optional)
### Dependencies
```bash
# Install system packages
sudo apt-get update
sudo apt-get install -y python3.9 python3.9-dev python3-pip
sudo apt-get install -y postgresql-13 postgresql-client-13
sudo apt-get install -y redis-server nginx
sudo apt-get install -y git curl wget unzip
# Install Python dependencies
pip3 install -r requirements.txt
```
## 🏗️ **Infrastructure Setup**
### 1. Database Cluster Setup
#### PostgreSQL Primary-Replica Configuration
```bash
# Primary Database Server
sudo -u postgres psql
CREATE DATABASE etb_incident_management;
CREATE USER etb_user WITH PASSWORD 'secure_password';
GRANT ALL PRIVILEGES ON DATABASE etb_incident_management TO etb_user;
# Configure replication
sudo nano /etc/postgresql/13/main/postgresql.conf
```
```conf
# postgresql.conf
listen_addresses = '*'
wal_level = replica
max_wal_senders = 3
max_replication_slots = 3
hot_standby = on
```
```bash
# Configure authentication
sudo nano /etc/postgresql/13/main/pg_hba.conf
```
```conf
# pg_hba.conf
host replication replicator 10.0.0.0/8 md5
host all all 10.0.0.0/8 md5
```
#### Redis Cluster Setup
```bash
# Redis Master
sudo nano /etc/redis/redis.conf
```
```conf
# redis.conf
bind 0.0.0.0
port 6379
requirepass secure_redis_password
maxmemory 2gb
maxmemory-policy allkeys-lru
save 900 1
save 300 10
save 60 10000
```
### 2. Load Balancer Configuration
#### Nginx Load Balancer
```bash
sudo nano /etc/nginx/sites-available/etb-api
```
```nginx
upstream etb_api_backend {
least_conn;
server 10.0.1.10:8000 max_fails=3 fail_timeout=30s;
server 10.0.1.11:8000 max_fails=3 fail_timeout=30s;
server 10.0.1.12:8000 max_fails=3 fail_timeout=30s;
}
upstream etb_api_websocket {
ip_hash;
server 10.0.1.10:8001;
server 10.0.1.11:8001;
server 10.0.1.12:8001;
}
server {
listen 80;
server_name api.yourcompany.com;
return 301 https://$server_name$request_uri;
}
server {
listen 443 ssl http2;
server_name api.yourcompany.com;
# SSL Configuration
ssl_certificate /etc/ssl/certs/etb-api.crt;
ssl_certificate_key /etc/ssl/private/etb-api.key;
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES256-GCM-SHA384;  # note: the "…GCM-SHA512" suite names do not exist in OpenSSL
ssl_prefer_server_ciphers off;
ssl_session_cache shared:SSL:10m;
ssl_session_timeout 10m;
# Security Headers
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
add_header X-Content-Type-Options nosniff;
add_header X-Frame-Options DENY;
add_header X-XSS-Protection "1; mode=block";
add_header Content-Security-Policy "default-src 'self'";
# Rate Limiting
# NOTE: the limit_req_zone directive is only valid in the http context
# (e.g. in /etc/nginx/nginx.conf), not inside a server block:
#   limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
limit_req zone=api burst=20 nodelay;
# API Routes
location /api/ {
proxy_pass http://etb_api_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_connect_timeout 30s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
}
# WebSocket Routes
location /ws/ {
proxy_pass http://etb_api_websocket;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Static Files
location /static/ {
alias /var/www/etb-api/static/;
expires 1y;
add_header Cache-Control "public, immutable";
}
# Media Files
location /media/ {
alias /var/www/etb-api/media/;
expires 1y;
add_header Cache-Control "public";
}
# Health Checks
location /health/ {
proxy_pass http://etb_api_backend;
access_log off;
}
}
```
## 🗄️ **Database Configuration**
### 1. Database Optimization
```sql
-- PostgreSQL Performance Tuning
ALTER SYSTEM SET shared_buffers = '4GB';
ALTER SYSTEM SET effective_cache_size = '12GB';
ALTER SYSTEM SET maintenance_work_mem = '1GB';
ALTER SYSTEM SET checkpoint_completion_target = 0.9;
ALTER SYSTEM SET wal_buffers = '16MB';
ALTER SYSTEM SET default_statistics_target = 100;
ALTER SYSTEM SET random_page_cost = 1.1;
ALTER SYSTEM SET effective_io_concurrency = 200;
-- Restart PostgreSQL afterwards for these settings to take effect
-- (run from the shell, not inside psql):
--   sudo systemctl restart postgresql
```
### 2. Database Indexes
```sql
-- Create performance indexes
CREATE INDEX CONCURRENTLY idx_incident_status_priority ON incident_intelligence_incident(status, priority);
CREATE INDEX CONCURRENTLY idx_incident_created_at ON incident_intelligence_incident(created_at);
CREATE INDEX CONCURRENTLY idx_sla_instance_status ON sla_oncall_slainstance(status);
CREATE INDEX CONCURRENTLY idx_security_event_timestamp ON security_securityevent(timestamp);
CREATE INDEX CONCURRENTLY idx_monitoring_metric_timestamp ON monitoring_metric(timestamp);
-- Create partial indexes for active records
CREATE INDEX CONCURRENTLY idx_incident_active ON incident_intelligence_incident(id) WHERE status = 'active';
CREATE INDEX CONCURRENTLY idx_sla_active ON sla_oncall_slainstance(id) WHERE status = 'active';
```
## 🚀 **Application Deployment**
### 1. Environment Configuration
```bash
# Create environment file
sudo nano /etc/etb-api/.env
```
```env
# Database Configuration
DB_NAME=etb_incident_management
DB_USER=etb_user
DB_PASSWORD=secure_password
DB_HOST=10.0.1.5
DB_PORT=5432
# Redis Configuration
REDIS_URL=redis://:secure_redis_password@10.0.1.6:6379/0
CELERY_BROKER_URL=redis://:secure_redis_password@10.0.1.6:6379/0
CELERY_RESULT_BACKEND=redis://:secure_redis_password@10.0.1.6:6379/0
# Security
SECRET_KEY=your-super-secret-key-here
DEBUG=False
ALLOWED_HOSTS=api.yourcompany.com,10.0.1.10,10.0.1.11,10.0.1.12
# Email Configuration
EMAIL_HOST=smtp.yourcompany.com
EMAIL_PORT=587
EMAIL_USE_TLS=True
EMAIL_HOST_USER=noreply@yourcompany.com
EMAIL_HOST_PASSWORD=email_password
DEFAULT_FROM_EMAIL=noreply@yourcompany.com
# Monitoring
PROMETHEUS_ENABLED=True
GRAFANA_ENABLED=True
ELASTICSEARCH_URL=http://10.0.1.8:9200
# Backup
BACKUP_ENABLED=True
BACKUP_RETENTION_DAYS=30
AWS_S3_BACKUP_BUCKET=etb-api-backups
AWS_ACCESS_KEY_ID=your-access-key
AWS_SECRET_ACCESS_KEY=your-secret-key
AWS_REGION=us-east-1
# Security
SIEM_WEBHOOK_URL=https://siem.yourcompany.com/webhook
SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
ALERT_WEBHOOK_URL=https://alerts.yourcompany.com/webhook
```
### 2. Application Deployment Script
```bash
#!/bin/bash
# deploy.sh - Enterprise deployment script
set -e
# Configuration
APP_NAME="etb-api"
APP_USER="etb"
APP_DIR="/var/www/etb-api"
VENV_DIR="/var/www/etb-api/venv"
LOG_DIR="/var/log/etb-api"
BACKUP_DIR="/backups/etb-api"
# Create application user
sudo useradd -r -s /bin/false $APP_USER || true
# Create directories
sudo mkdir -p $APP_DIR $LOG_DIR $BACKUP_DIR
sudo chown -R $APP_USER:$APP_USER $APP_DIR $LOG_DIR $BACKUP_DIR
# Clone repository
cd $APP_DIR
sudo -u $APP_USER git clone https://github.com/yourcompany/etb-api.git .
# Create virtual environment
sudo -u $APP_USER python3 -m venv $VENV_DIR
sudo -u $APP_USER $VENV_DIR/bin/pip install --upgrade pip
sudo -u $APP_USER $VENV_DIR/bin/pip install -r requirements.txt
# Set up environment
sudo -u $APP_USER cp .env.example .env
sudo -u $APP_USER nano .env # Configure environment variables
# Run migrations
sudo -u $APP_USER $VENV_DIR/bin/python manage.py migrate
# Collect static files
sudo -u $APP_USER $VENV_DIR/bin/python manage.py collectstatic --noinput
# Create superuser
sudo -u $APP_USER $VENV_DIR/bin/python manage.py createsuperuser
# Set up systemd services
sudo cp deployment/systemd/*.service /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable etb-api etb-celery etb-celery-beat
sudo systemctl start etb-api etb-celery etb-celery-beat
# Set up log rotation
sudo cp deployment/logrotate/etb-api /etc/logrotate.d/
sudo chmod 644 /etc/logrotate.d/etb-api
# Set up monitoring
sudo -u $APP_USER $VENV_DIR/bin/python manage.py setup_monitoring
echo "Deployment completed successfully!"
```
### 3. Systemd Services
#### ETB-API Service
```ini
# /etc/systemd/system/etb-api.service
[Unit]
Description=ETB-API Django Application
After=network.target postgresql.service redis.service
Requires=postgresql.service redis.service
[Service]
Type=exec
User=etb
Group=etb
WorkingDirectory=/var/www/etb-api
Environment=PATH=/var/www/etb-api/venv/bin
EnvironmentFile=/etc/etb-api/.env
ExecStart=/var/www/etb-api/venv/bin/gunicorn --bind 0.0.0.0:8000 --workers 4 --worker-class gevent --worker-connections 1000 --max-requests 1000 --max-requests-jitter 100 --timeout 30 --keep-alive 2 --preload core.wsgi:application
ExecReload=/bin/kill -s HUP $MAINPID
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=etb-api
[Install]
WantedBy=multi-user.target
```
#### Celery Worker Service
```ini
# /etc/systemd/system/etb-celery.service
[Unit]
Description=ETB-API Celery Worker
After=network.target redis.service
Requires=redis.service
[Service]
Type=exec
User=etb
Group=etb
WorkingDirectory=/var/www/etb-api
Environment=PATH=/var/www/etb-api/venv/bin
EnvironmentFile=/etc/etb-api/.env
ExecStart=/var/www/etb-api/venv/bin/celery -A core worker -l info --concurrency=4 --max-tasks-per-child=1000
ExecReload=/bin/kill -s HUP $MAINPID
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=etb-celery
[Install]
WantedBy=multi-user.target
```
#### Celery Beat Service
```ini
# /etc/systemd/system/etb-celery-beat.service
[Unit]
Description=ETB-API Celery Beat Scheduler
After=network.target redis.service
Requires=redis.service
[Service]
Type=exec
User=etb
Group=etb
WorkingDirectory=/var/www/etb-api
Environment=PATH=/var/www/etb-api/venv/bin
EnvironmentFile=/etc/etb-api/.env
ExecStart=/var/www/etb-api/venv/bin/celery -A core beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler
ExecReload=/bin/kill -s HUP $MAINPID
Restart=always
RestartSec=10
StandardOutput=journal
StandardError=journal
SyslogIdentifier=etb-celery-beat
[Install]
WantedBy=multi-user.target
```
## 🔒 **Security Configuration**
### 1. Firewall Configuration
```bash
# UFW Firewall Rules
sudo ufw default deny incoming
sudo ufw default allow outgoing
sudo ufw allow ssh
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
sudo ufw allow from 10.0.0.0/8 to any port 5432 # PostgreSQL
sudo ufw allow from 10.0.0.0/8 to any port 6379 # Redis
sudo ufw enable
```
### 2. SSL/TLS Configuration
```bash
# Generate SSL certificate
sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
-keyout /etc/ssl/private/etb-api.key \
-out /etc/ssl/certs/etb-api.crt \
-subj "/C=US/ST=State/L=City/O=Organization/CN=api.yourcompany.com"
# Set permissions
sudo chmod 600 /etc/ssl/private/etb-api.key
sudo chmod 644 /etc/ssl/certs/etb-api.crt
```
### 3. Security Hardening
```bash
# Disable unnecessary services
sudo systemctl disable apache2
sudo systemctl disable mysql
# Configure fail2ban
sudo apt-get install fail2ban
sudo cp /etc/fail2ban/jail.conf /etc/fail2ban/jail.local
# Add custom jail for ETB-API
sudo nano /etc/fail2ban/jail.d/etb-api.conf
```
```ini
[etb-api]
enabled = true
port = 80,443
filter = etb-api
logpath = /var/log/etb-api/application.log
maxretry = 5
bantime = 3600
findtime = 600
```
## 📊 **Monitoring & Observability**
### 1. Prometheus Configuration
```yaml
# /etc/prometheus/prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "etb-api-rules.yml"
scrape_configs:
- job_name: 'etb-api'
static_configs:
- targets: ['10.0.1.10:8000', '10.0.1.11:8000', '10.0.1.12:8000']
metrics_path: '/api/monitoring/metrics/'
scrape_interval: 30s
- job_name: 'postgresql'
static_configs:
- targets: ['10.0.1.5:9187']
- job_name: 'redis'
static_configs:
- targets: ['10.0.1.6:9121']
- job_name: 'nginx'
static_configs:
- targets: ['10.0.1.7:9113']
```
### 2. Grafana Dashboard
```json
{
"dashboard": {
"title": "ETB-API Enterprise Dashboard",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{endpoint}}"
}
]
},
{
"title": "Response Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
}
]
},
{
"title": "System Resources",
"type": "graph",
"targets": [
{
"expr": "system_cpu_usage_percent",
"legendFormat": "CPU Usage"
},
{
"expr": "system_memory_usage_percent",
"legendFormat": "Memory Usage"
}
]
}
]
}
}
```
### 3. Log Aggregation
```yaml
# /etc/filebeat/filebeat.yml
filebeat.inputs:
- type: log
enabled: true
paths:
- /var/log/etb-api/*.log
fields:
service: etb-api
fields_under_root: true
output.elasticsearch:
hosts: ["10.0.1.8:9200"]
index: "etb-api-%{+yyyy.MM.dd}"
processors:
- add_host_metadata:
when.not.contains.tags: forwarded
```
## 💾 **Backup & Recovery**
### 1. Automated Backup Script
```bash
#!/bin/bash
# backup.sh - Automated backup script
set -e
BACKUP_DIR="/backups/etb-api"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_NAME="etb-api-backup-$DATE"
# Create backup directory
mkdir -p $BACKUP_DIR/$BACKUP_NAME
# Database backup
pg_dump -h 10.0.1.5 -U etb_user etb_incident_management > $BACKUP_DIR/$BACKUP_NAME/database.sql
# Application backup
tar -czf $BACKUP_DIR/$BACKUP_NAME/application.tar.gz -C /var/www etb-api
# Configuration backup
tar -czf $BACKUP_DIR/$BACKUP_NAME/config.tar.gz /etc/etb-api /etc/nginx/sites-available/etb-api
# Create backup manifest
cat > $BACKUP_DIR/$BACKUP_NAME/manifest.json << EOF
{
"backup_name": "$BACKUP_NAME",
"created_at": "$(date -Iseconds)",
"components": {
"database": "database.sql",
"application": "application.tar.gz",
"configuration": "config.tar.gz"
},
"size": "$(du -sh $BACKUP_DIR/$BACKUP_NAME | cut -f1)"
}
EOF
# Compress backup
tar -czf $BACKUP_DIR/$BACKUP_NAME.tar.gz -C $BACKUP_DIR $BACKUP_NAME
rm -rf $BACKUP_DIR/$BACKUP_NAME
# Upload to S3
aws s3 cp $BACKUP_DIR/$BACKUP_NAME.tar.gz s3://etb-api-backups/
# Cleanup old backups
find $BACKUP_DIR -name "*.tar.gz" -mtime +30 -delete
echo "Backup completed: $BACKUP_NAME"
```
### 2. Recovery Script
```bash
#!/bin/bash
# restore.sh - Recovery script
set -e
BACKUP_NAME=$1
BACKUP_DIR="/backups/etb-api"
if [ -z "$BACKUP_NAME" ]; then
echo "Usage: $0 <backup_name>"
exit 1
fi
# Download from S3
aws s3 cp s3://etb-api-backups/$BACKUP_NAME.tar.gz $BACKUP_DIR/
# Extract backup
tar -xzf $BACKUP_DIR/$BACKUP_NAME.tar.gz -C $BACKUP_DIR
# Stop services
sudo systemctl stop etb-api etb-celery etb-celery-beat
# Restore database
psql -h 10.0.1.5 -U etb_user etb_incident_management < $BACKUP_DIR/$BACKUP_NAME/database.sql
# Restore application
tar -xzf $BACKUP_DIR/$BACKUP_NAME/application.tar.gz -C /var/www
# Restore configuration
tar -xzf $BACKUP_DIR/$BACKUP_NAME/config.tar.gz -C /
# Start services
sudo systemctl start etb-api etb-celery etb-celery-beat
echo "Recovery completed: $BACKUP_NAME"
```
## 🔄 **High Availability**
### 1. Load Balancer Health Checks
```nginx
# Health check configuration
upstream etb_api_backend {
least_conn;
server 10.0.1.10:8000 max_fails=3 fail_timeout=30s;
server 10.0.1.11:8000 max_fails=3 fail_timeout=30s;
server 10.0.1.12:8000 max_fails=3 fail_timeout=30s;
}
# Health check endpoint
location /health/ {
proxy_pass http://etb_api_backend;
proxy_connect_timeout 5s;
proxy_send_timeout 5s;
proxy_read_timeout 5s;
access_log off;
}
```
### 2. Database Failover
```bash
# PostgreSQL failover script
#!/bin/bash
# failover.sh - Database failover script
PRIMARY_HOST="10.0.1.5"
STANDBY_HOST="10.0.1.15"  # NOTE: 10.0.1.6 is the Redis host elsewhere in this guide; the standby database needs its own address
# Check primary health
if ! pg_isready -h $PRIMARY_HOST -p 5432; then
echo "Primary database is down, initiating failover..."
# Promote standby
ssh $STANDBY_HOST "sudo -u postgres pg_ctl promote -D /var/lib/postgresql/13/main"
# Update application configuration
sed -i "s/DB_HOST=$PRIMARY_HOST/DB_HOST=$STANDBY_HOST/" /etc/etb-api/.env
# Restart application
sudo systemctl restart etb-api
echo "Failover completed to $STANDBY_HOST"
fi
```
## ⚡ **Performance Optimization**
### 1. Application Optimization
```python
# settings.py optimizations
CACHES = {
'default': {
'BACKEND': 'django_redis.cache.RedisCache',
'LOCATION': 'redis://10.0.1.6:6379/1',
'OPTIONS': {
'CLIENT_CLASS': 'django_redis.client.DefaultClient',
'CONNECTION_POOL_KWARGS': {
'max_connections': 50,
'retry_on_timeout': True,
},
'COMPRESSOR': 'django_redis.compressors.zlib.ZlibCompressor',
}
}
}
# Database connection pooling
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': 'etb_incident_management',
'USER': 'etb_user',
'PASSWORD': 'secure_password',
'HOST': '10.0.1.5',
'PORT': '5432',
'CONN_MAX_AGE': 600,
'CONN_HEALTH_CHECKS': True,
'OPTIONS': {
'MAX_CONNS': 20,
'MIN_CONNS': 5,
}
}
}
```
### 2. Nginx Optimization
```nginx
# nginx.conf optimizations
worker_processes auto;
worker_cpu_affinity auto;
worker_rlimit_nofile 65535;
events {
worker_connections 4096;
use epoll;
multi_accept on;
}
http {
# Gzip compression
gzip on;
gzip_vary on;
gzip_min_length 1024;
gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript;
# Buffer sizes
client_body_buffer_size 128k;
client_max_body_size 10m;
client_header_buffer_size 1k;
large_client_header_buffers 4 4k;
# Timeouts
client_body_timeout 12;
client_header_timeout 12;
keepalive_timeout 15;
send_timeout 10;
}
```
## 🔧 **Maintenance & Operations**
### 1. Monitoring Scripts
```bash
#!/bin/bash
# health_check.sh - Comprehensive health check
# Check application health
curl -f http://localhost:8000/health/ || exit 1
# Check database connectivity
pg_isready -h 10.0.1.5 -p 5432 || exit 1
# Check Redis connectivity
redis-cli -h 10.0.1.6 -p 6379 ping || exit 1
# Check disk space (strip the trailing "%" before comparing numerically;
# a plain `$5 > 90` would compare the string "87%" against a number)
df -h | awk 'NR>1 {gsub(/%/, "", $5); if ($5+0 > 90) {print; exit 1}}'
# Check memory usage
free | awk 'NR==2{printf "%.2f%%\n", $3*100/$2}' | awk '{if($1 > 90) exit 1}'
echo "All health checks passed"
```
### 2. Log Rotation
```bash
# /etc/logrotate.d/etb-api
/var/log/etb-api/*.log {
daily
missingok
rotate 30
compress
delaycompress
notifempty
create 644 etb etb
postrotate
systemctl reload etb-api
endscript
}
```
### 3. Update Script
```bash
#!/bin/bash
# update.sh - Application update script
set -e
# Backup current version
./backup.sh
# Pull latest changes
cd /var/www/etb-api
sudo -u etb git pull origin main
# Update dependencies
sudo -u etb /var/www/etb-api/venv/bin/pip install -r requirements.txt
# Run migrations
sudo -u etb /var/www/etb-api/venv/bin/python manage.py migrate
# Collect static files
sudo -u etb /var/www/etb-api/venv/bin/python manage.py collectstatic --noinput
# Restart services
sudo systemctl restart etb-api etb-celery etb-celery-beat
echo "Update completed successfully"
```
## 📈 **Scaling Guidelines**
### 1. Horizontal Scaling
- **Application Servers**: Add more instances behind load balancer
- **Database**: Implement read replicas for read-heavy workloads
- **Cache**: Use Redis Cluster for distributed caching
- **Storage**: Implement distributed file storage (S3, Ceph)
### 2. Vertical Scaling
- **CPU**: Increase cores for compute-intensive operations
- **Memory**: Add RAM for caching and in-memory operations
- **Storage**: Use SSD for better I/O performance
- **Network**: Upgrade bandwidth for high-traffic scenarios
## 🚨 **Troubleshooting**
### Common Issues
1. **High Memory Usage**
```bash
# Check memory usage
free -h
ps aux --sort=-%mem | head
# Restart services if needed
sudo systemctl restart etb-api
```
2. **Database Connection Issues**
```bash
# Check database status
sudo systemctl status postgresql
pg_isready -h 10.0.1.5 -p 5432
# Check connection pool
psql -h 10.0.1.5 -U etb_user -c "SELECT * FROM pg_stat_activity;"
```
3. **Cache Issues**
```bash
# Check Redis status
sudo systemctl status redis
redis-cli -h 10.0.1.6 -p 6379 ping
# Clear cache if needed
redis-cli -h 10.0.1.6 -p 6379 FLUSHALL
```
## 📞 **Support & Maintenance**
### Regular Tasks
- **Daily**: Monitor system health and alerts
- **Weekly**: Review performance metrics and logs
- **Monthly**: Update dependencies and security patches
- **Quarterly**: Review and optimize performance
### Emergency Procedures
1. **Service Outage**: Check health endpoints and restart services
2. **Database Issues**: Check connectivity and failover if needed
3. **Security Incident**: Review logs and implement containment
4. **Performance Degradation**: Analyze metrics and scale resources
This comprehensive deployment guide provides enterprise-grade setup for the ETB-API platform. Adjust configurations based on your specific requirements and infrastructure.

View File

@@ -0,0 +1,495 @@
# ETB-API Monitoring System Deployment Guide
## Overview
This guide provides step-by-step instructions for deploying the comprehensive monitoring system for your ETB-API platform. The monitoring system provides enterprise-grade observability across all modules.
## Prerequisites
### System Requirements
- Python 3.8+
- Django 5.2+
- PostgreSQL 12+ (recommended) or SQLite (development)
- Redis 6+ (for Celery)
- Celery 5.3+
### Dependencies
- psutil>=5.9.0
- requests>=2.31.0
- celery>=5.3.0
- redis>=4.5.0
## Installation Steps
### 1. Install Dependencies
```bash
# Install Python dependencies (quote the version specifiers — otherwise the
# shell treats ">=5.9.0" as output redirection to a file named "=5.9.0")
pip install "psutil>=5.9.0" "requests>=2.31.0"
# Install Redis (Ubuntu/Debian)
sudo apt-get install redis-server
# Install Redis (CentOS/RHEL)
sudo yum install redis
```
### 2. Database Setup
```bash
# Create and run migrations
python manage.py makemigrations monitoring
python manage.py migrate
# Create superuser (if not exists)
python manage.py createsuperuser
```
### 3. Initialize Monitoring Configuration
```bash
# Set up default monitoring targets, metrics, and alert rules
python manage.py setup_monitoring --admin-user admin
```
### 4. Configure Celery
Create or update `celery.py` in your project:
```python
from celery import Celery
from django.conf import settings
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
app = Celery('core')
app.config_from_object('django.conf:settings', namespace='CELERY')
# Add monitoring tasks schedule
app.conf.beat_schedule = {
'health-checks': {
'task': 'monitoring.tasks.execute_health_checks',
'schedule': 60.0, # Every minute
},
'metrics-collection': {
'task': 'monitoring.tasks.collect_metrics',
'schedule': 300.0, # Every 5 minutes
},
'alert-evaluation': {
'task': 'monitoring.tasks.evaluate_alerts',
'schedule': 60.0, # Every minute
},
'data-cleanup': {
'task': 'monitoring.tasks.cleanup_old_data',
'schedule': 86400.0, # Daily
},
'system-status-report': {
'task': 'monitoring.tasks.generate_system_status_report',
'schedule': 300.0, # Every 5 minutes
},
}
app.autodiscover_tasks()
```
### 5. Environment Configuration
Create `.env` file or set environment variables:
```bash
# Monitoring Settings
MONITORING_ENABLED=true
MONITORING_HEALTH_CHECK_INTERVAL=60
MONITORING_METRICS_COLLECTION_INTERVAL=300
MONITORING_ALERT_EVALUATION_INTERVAL=60
# Alerting Settings
ALERTING_EMAIL_FROM=monitoring@yourcompany.com
ALERTING_SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK
ALERTING_WEBHOOK_URL=https://your-webhook-url.com/alerts
# Performance Thresholds
PERFORMANCE_API_RESPONSE_THRESHOLD=2000
PERFORMANCE_CPU_THRESHOLD=80
PERFORMANCE_MEMORY_THRESHOLD=80
PERFORMANCE_DISK_THRESHOLD=80
# Email Configuration (for alerts)
EMAIL_HOST=smtp.gmail.com
EMAIL_PORT=587
EMAIL_USE_TLS=True
EMAIL_HOST_USER=your-email@gmail.com
EMAIL_HOST_PASSWORD=your-app-password
DEFAULT_FROM_EMAIL=monitoring@yourcompany.com
```
### 6. Start Services
```bash
# Start Django development server
python manage.py runserver
# Start Celery worker (in separate terminal)
celery -A core worker -l info
# Start Celery beat scheduler (in separate terminal)
celery -A core beat -l info
# Start Redis (if not running as service)
redis-server
```
## Production Deployment
### 1. Database Configuration
For production, use PostgreSQL:
```python
# settings.py
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': 'etb_api_monitoring',
'USER': 'monitoring_user',
'PASSWORD': 'secure_password',
'HOST': 'localhost',
'PORT': '5432',
}
}
```
### 2. Redis Configuration
```python
# settings.py
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
```
### 3. Static Files and Media
```bash
# Collect static files
python manage.py collectstatic
# Configure web server (Nginx example)
server {
listen 80;
server_name your-domain.com;
location /static/ {
alias /path/to/your/static/files/;
}
location /media/ {
alias /path/to/your/media/files/;
}
location / {
proxy_pass http://127.0.0.1:8000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
}
}
```
### 4. Process Management
Use systemd services for production:
**Django Service** (`/etc/systemd/system/etb-api.service`):
```ini
[Unit]
Description=ETB-API Django Application
After=network.target
[Service]
Type=simple
User=www-data
Group=www-data
WorkingDirectory=/path/to/etb-api
Environment=PATH=/path/to/etb-api/venv/bin
# NOTE: manage.py runserver is Django's development server and must not be used in production;
# serve the application with a WSGI server such as gunicorn instead:
ExecStart=/path/to/etb-api/venv/bin/gunicorn --bind 0.0.0.0:8000 core.wsgi:application
Restart=always
[Install]
WantedBy=multi-user.target
```
**Celery Worker Service** (`/etc/systemd/system/etb-celery.service`):
```ini
[Unit]
Description=ETB-API Celery Worker
After=network.target
[Service]
Type=simple
User=www-data
Group=www-data
WorkingDirectory=/path/to/etb-api
Environment=PATH=/path/to/etb-api/venv/bin
ExecStart=/path/to/etb-api/venv/bin/celery -A core worker -l info
Restart=always
[Install]
WantedBy=multi-user.target
```
**Celery Beat Service** (`/etc/systemd/system/etb-celery-beat.service`):
```ini
[Unit]
Description=ETB-API Celery Beat Scheduler
After=network.target
[Service]
Type=simple
User=www-data
Group=www-data
WorkingDirectory=/path/to/etb-api
Environment=PATH=/path/to/etb-api/venv/bin
ExecStart=/path/to/etb-api/venv/bin/celery -A core beat -l info
Restart=always
[Install]
WantedBy=multi-user.target
```
### 5. Enable Services
```bash
# Enable and start services
sudo systemctl enable etb-api
sudo systemctl enable etb-celery
sudo systemctl enable etb-celery-beat
sudo systemctl enable redis
sudo systemctl start etb-api
sudo systemctl start etb-celery
sudo systemctl start etb-celery-beat
sudo systemctl start redis
```
## Monitoring Configuration
### 1. Customize Monitoring Targets
Access the admin interface at `http://your-domain.com/admin/monitoring/` to:
- Add custom monitoring targets
- Configure health check intervals
- Set up external service monitoring
- Customize alert thresholds
### 2. Configure Alert Rules
Create alert rules for:
- **Performance Alerts**: High response times, error rates
- **Business Alerts**: SLA breaches, incident volume spikes
- **Security Alerts**: Failed logins, security events
- **Infrastructure Alerts**: High CPU, memory, disk usage
### 3. Set Up Notification Channels
Configure notification channels:
- **Email**: Set up SMTP configuration
- **Slack**: Configure webhook URLs
- **Webhooks**: Set up external alerting systems
- **PagerDuty**: Integrate with incident management
### 4. Create Custom Dashboards
Design dashboards for different user roles:
- **Executive Dashboard**: High-level KPIs and trends
- **Operations Dashboard**: Real-time system status
- **Security Dashboard**: Security metrics and alerts
- **Development Dashboard**: Application performance metrics
## Verification
### 1. Check System Health
```bash
# Check health check summary
curl -H "Authorization: Token your-token" \
http://localhost:8000/api/monitoring/health-checks/summary/
# Check system overview
curl -H "Authorization: Token your-token" \
http://localhost:8000/api/monitoring/overview/
```
### 2. Verify Celery Tasks
```bash
# Check Celery worker status
celery -A core inspect active
# Check scheduled tasks
celery -A core inspect scheduled
```
### 3. Test Alerting
```bash
# Trigger a test alert
python manage.py shell
>>> from monitoring.models import AlertRule
>>> rule = AlertRule.objects.first()
>>> # Manually trigger alert for testing
```
## Maintenance
### 1. Data Cleanup
The system automatically cleans up old data, but you can manually run:
```bash
python manage.py shell
>>> from monitoring.tasks import cleanup_old_data
>>> cleanup_old_data.delay()
```
### 2. Performance Tuning
Monitor and tune:
- Health check intervals
- Metrics collection frequency
- Alert evaluation intervals
- Data retention periods
### 3. Scaling
For high-volume environments:
- Use multiple Celery workers
- Implement Redis clustering
- Use database read replicas
- Consider time-series databases for metrics
## Troubleshooting
### Common Issues
1. **Health Checks Failing**
```bash
# Check logs
tail -f /var/log/etb-api.log
# Test individual targets
python manage.py shell
>>> from monitoring.services.health_checks import HealthCheckService
>>> service = HealthCheckService()
>>> service.execute_all_health_checks()
```
2. **Celery Tasks Not Running**
```bash
# Check Celery status
celery -A core inspect active
# Check Redis connection
redis-cli ping
# Restart services
sudo systemctl restart etb-celery
sudo systemctl restart etb-celery-beat
```
3. **Alerts Not Sending**
```bash
# Check email configuration
python manage.py shell
>>> from django.core.mail import send_mail
>>> send_mail('Test', 'Test message', 'from@example.com', ['to@example.com'])
# Check Slack webhook
curl -X POST -H 'Content-type: application/json' \
--data '{"text":"Test message"}' \
YOUR_SLACK_WEBHOOK_URL
```
### Log Locations
- Django logs: `/var/log/etb-api.log`
- Celery logs: `/var/log/celery.log`
- Nginx logs: `/var/log/nginx/`
- System logs: `/var/log/syslog`
## Security Considerations
### 1. Authentication
- Use strong authentication tokens
- Implement token rotation
- Use HTTPS in production
- Restrict admin access
### 2. Data Protection
- Encrypt sensitive configuration data
- Use secure database connections
- Implement data retention policies
- Regular security audits
### 3. Network Security
- Use firewalls to restrict access
- Implement rate limiting
- Monitor for suspicious activity
- Regular security updates
## Backup and Recovery
### 1. Database Backup
```bash
# PostgreSQL backup
pg_dump etb_api_monitoring > backup_$(date +%Y%m%d_%H%M%S).sql
# Automated backup script
#!/bin/bash
BACKUP_DIR="/backups/monitoring"
DATE=$(date +%Y%m%d_%H%M%S)
pg_dump etb_api_monitoring > $BACKUP_DIR/backup_$DATE.sql
find $BACKUP_DIR -name "backup_*.sql" -mtime +7 -delete
```
### 2. Configuration Backup
```bash
# Backup configuration files
tar -czf monitoring_config_$(date +%Y%m%d).tar.gz \
/path/to/etb-api/core/settings.py \
/path/to/etb-api/.env \
/etc/systemd/system/etb-*.service
```
### 3. Recovery Procedures
1. Restore database from backup
2. Restore configuration files
3. Restart services
4. Verify monitoring functionality
5. Check alert rules and thresholds
## Support and Maintenance
### Regular Tasks
- **Daily**: Check system health and alerts
- **Weekly**: Review metrics trends and thresholds
- **Monthly**: Update monitoring configuration
- **Quarterly**: Review and optimize performance
### Monitoring the Monitor
- Set up external monitoring for the monitoring system
- Monitor Celery worker health
- Track database performance
- Monitor disk space usage
This deployment guide provides a comprehensive foundation for implementing enterprise-grade monitoring for your ETB-API system. Adjust configurations based on your specific requirements and infrastructure.

View File

@@ -0,0 +1,633 @@
# Analytics & Predictive Insights API Documentation
## Overview
The Analytics & Predictive Insights module provides comprehensive analytics capabilities for incident management, including advanced KPIs, predictive analytics, ML-based anomaly detection, and cost impact analysis.
## Features
- **Advanced KPIs**: MTTA, MTTR, incident recurrence rate, availability metrics
- **Predictive Analytics**: ML-based incident prediction, severity prediction, resolution time prediction
- **Anomaly Detection**: Statistical, temporal, and pattern-based anomaly detection
- **Cost Analysis**: Downtime cost, lost revenue, penalty cost analysis
- **Dashboards**: Configurable dashboards with heatmaps and visualizations
- **Heatmaps**: Time-based incident frequency, resolution time, and cost impact visualizations
## API Endpoints
### Base URL
```
/api/analytics/
```
## KPI Metrics
### List KPI Metrics
```http
GET /api/analytics/kpi-metrics/
```
**Query Parameters:**
- `metric_type`: Filter by metric type (MTTA, MTTR, INCIDENT_COUNT, etc.)
- `is_active`: Filter by active status (true/false)
- `is_system_metric`: Filter by system metric status (true/false)
- `created_after`: Filter by creation date (ISO 8601)
- `created_before`: Filter by creation date (ISO 8601)
**Response:**
```json
{
"count": 10,
"next": null,
"previous": null,
"results": [
{
"id": "uuid",
"name": "Mean Time to Acknowledge",
"description": "Average time to acknowledge incidents",
"metric_type": "MTTA",
"aggregation_type": "AVERAGE",
"incident_categories": ["Infrastructure", "Application"],
"incident_severities": ["HIGH", "CRITICAL"],
"incident_priorities": ["P1", "P2"],
"calculation_formula": null,
"time_window_hours": 24,
"is_active": true,
"is_system_metric": true,
"created_by_username": "admin",
"created_at": "2024-01-01T00:00:00Z",
"updated_at": "2024-01-01T00:00:00Z",
"measurement_count": 100,
"latest_measurement": {
"value": "15.5",
"unit": "minutes",
"calculated_at": "2024-01-01T12:00:00Z",
"incident_count": 25
}
}
]
}
```
### Get KPI Metric Details
```http
GET /api/analytics/kpi-metrics/{id}/
```
### Create KPI Metric
```http
POST /api/analytics/kpi-metrics/
```
**Request Body:**
```json
{
"name": "Custom MTTR",
"description": "Custom Mean Time to Resolve metric",
"metric_type": "MTTR",
"aggregation_type": "AVERAGE",
"incident_categories": ["Infrastructure"],
"incident_severities": ["HIGH", "CRITICAL"],
"time_window_hours": 48,
"is_active": true
}
```
### Get KPI Measurements
```http
GET /api/analytics/kpi-metrics/{id}/measurements/
```
**Query Parameters:**
- `start_date`: Filter by measurement period start (ISO 8601)
- `end_date`: Filter by measurement period end (ISO 8601)
### Get KPI Summary
```http
GET /api/analytics/kpi-metrics/summary/
```
**Response:**
```json
[
{
"metric_type": "MTTA",
"metric_name": "Mean Time to Acknowledge",
"current_value": "15.5",
"unit": "minutes",
"trend": "down",
"trend_percentage": "-5.2",
"period_start": "2024-01-01T00:00:00Z",
"period_end": "2024-01-01T24:00:00Z",
"incident_count": 25,
"target_value": null,
"target_met": true
}
]
```
## KPI Measurements
### List KPI Measurements
```http
GET /api/analytics/kpi-measurements/
```
**Query Parameters:**
- `metric_id`: Filter by metric ID
- `start_date`: Filter by measurement period start
- `end_date`: Filter by measurement period end
## Incident Recurrence Analysis
### List Recurrence Analyses
```http
GET /api/analytics/recurrence-analyses/
```
**Query Parameters:**
- `recurrence_type`: Filter by recurrence type
- `min_confidence`: Filter by minimum confidence score
- `is_resolved`: Filter by resolution status
### Get Unresolved Recurrence Analyses
```http
GET /api/analytics/recurrence-analyses/unresolved/
```
## Predictive Models
### List Predictive Models
```http
GET /api/analytics/predictive-models/
```
**Query Parameters:**
- `model_type`: Filter by model type
- `status`: Filter by model status
### Create Predictive Model
```http
POST /api/analytics/predictive-models/
```
**Request Body:**
```json
{
"name": "Incident Severity Predictor",
"description": "Predicts incident severity based on historical data",
"model_type": "SEVERITY_PREDICTION",
"algorithm_type": "RANDOM_FOREST",
"model_config": {
"n_estimators": 100,
"max_depth": 10
},
"feature_columns": ["title_length", "description_length", "category"],
"target_column": "severity",
"training_data_period_days": 90,
"min_training_samples": 100
}
```
### Train Model
```http
POST /api/analytics/predictive-models/{id}/train/
```
### Get Model Performance
```http
GET /api/analytics/predictive-models/{id}/performance/
```
## Anomaly Detection
### List Anomaly Detections
```http
GET /api/analytics/anomaly-detections/
```
**Query Parameters:**
- `anomaly_type`: Filter by anomaly type
- `severity`: Filter by severity level
- `status`: Filter by status
- `start_date`: Filter by detection date
- `end_date`: Filter by detection date
### Get Anomaly Summary
```http
GET /api/analytics/anomaly-detections/summary/
```
**Response:**
```json
{
"total_anomalies": 50,
"critical_anomalies": 5,
"high_anomalies": 15,
"medium_anomalies": 20,
"low_anomalies": 10,
"unresolved_anomalies": 12,
"false_positive_rate": "8.5",
"average_resolution_time": "2:30:00"
}
```
### Acknowledge Anomaly
```http
POST /api/analytics/anomaly-detections/{id}/acknowledge/
```
### Resolve Anomaly
```http
POST /api/analytics/anomaly-detections/{id}/resolve/
```
## Cost Impact Analysis
### List Cost Analyses
```http
GET /api/analytics/cost-analyses/
```
**Query Parameters:**
- `cost_type`: Filter by cost type
- `is_validated`: Filter by validation status
- `start_date`: Filter by creation date
- `end_date`: Filter by creation date
### Get Cost Summary
```http
GET /api/analytics/cost-analyses/summary/
```
**Response:**
```json
{
"total_cost": "125000.00",
"currency": "USD",
"downtime_cost": "75000.00",
"lost_revenue": "40000.00",
"penalty_cost": "10000.00",
"resource_cost": "0.00",
"total_downtime_hours": "150.5",
"total_affected_users": 5000,
"cost_per_hour": "830.56",
"cost_per_user": "25.00"
}
```
## Dashboard Configurations
### List Dashboard Configurations
```http
GET /api/analytics/dashboard-configurations/
```
**Query Parameters:**
- `dashboard_type`: Filter by dashboard type
- `is_active`: Filter by active status
### Create Dashboard Configuration
```http
POST /api/analytics/dashboard-configurations/
```
**Request Body:**
```json
{
"name": "Executive Dashboard",
"description": "High-level metrics for executives",
"dashboard_type": "EXECUTIVE",
"layout_config": {
"rows": 2,
"columns": 3
},
"widget_configs": [
{
"type": "kpi_summary",
"position": {"row": 0, "column": 0},
"size": {"width": 2, "height": 1}
},
{
"type": "anomaly_summary",
"position": {"row": 0, "column": 2},
"size": {"width": 1, "height": 1}
}
],
"is_public": false,
"allowed_roles": ["executive", "manager"],
"auto_refresh_enabled": true,
"refresh_interval_seconds": 300
}
```
### Get Dashboard Data
```http
GET /api/analytics/dashboard/{id}/data/
```
**Response:**
```json
{
"kpi_summary": [...],
"anomaly_summary": {...},
"cost_summary": {...},
"insight_summary": {...},
"recent_anomalies": [...],
"recent_insights": [...],
"heatmap_data": [...],
"last_updated": "2024-01-01T12:00:00Z"
}
```
## Heatmap Data
### List Heatmap Data
```http
GET /api/analytics/heatmap-data/
```
**Query Parameters:**
- `heatmap_type`: Filter by heatmap type
- `time_granularity`: Filter by time granularity
## Predictive Insights
### List Predictive Insights
```http
GET /api/analytics/predictive-insights/
```
**Query Parameters:**
- `insight_type`: Filter by insight type
- `confidence_level`: Filter by confidence level
- `is_acknowledged`: Filter by acknowledgment status
- `is_validated`: Filter by validation status
- `include_expired`: Include expired insights (true/false)
### Acknowledge Insight
```http
POST /api/analytics/predictive-insights/{id}/acknowledge/
```
### Get Insight Summary
```http
GET /api/analytics/predictive-insights/summary/
```
**Response:**
```json
{
"total_insights": 25,
"high_confidence_insights": 8,
"medium_confidence_insights": 12,
"low_confidence_insights": 5,
"acknowledged_insights": 15,
"validated_insights": 10,
"expired_insights": 3,
"average_accuracy": "0.85",
"active_models": 4
}
```
## Data Models
### KPI Metric
```json
{
"id": "uuid",
"name": "string",
"description": "string",
"metric_type": "MTTA|MTTR|MTBF|MTBSI|AVAILABILITY|INCIDENT_COUNT|RESOLUTION_RATE|ESCALATION_RATE|CUSTOM",
"aggregation_type": "AVERAGE|MEDIAN|MIN|MAX|SUM|COUNT|PERCENTILE_95|PERCENTILE_99",
"incident_categories": ["string"],
"incident_severities": ["string"],
"incident_priorities": ["string"],
"calculation_formula": "string",
"time_window_hours": "integer",
"is_active": "boolean",
"is_system_metric": "boolean",
"created_by": "uuid",
"created_at": "datetime",
"updated_at": "datetime"
}
```
### Predictive Model
```json
{
"id": "uuid",
"name": "string",
"description": "string",
"model_type": "ANOMALY_DETECTION|INCIDENT_PREDICTION|SEVERITY_PREDICTION|RESOLUTION_TIME_PREDICTION|ESCALATION_PREDICTION|COST_PREDICTION",
"algorithm_type": "ISOLATION_FOREST|LSTM|RANDOM_FOREST|XGBOOST|SVM|NEURAL_NETWORK|ARIMA|PROPHET",
"model_config": "object",
"feature_columns": ["string"],
"target_column": "string",
"training_data_period_days": "integer",
"min_training_samples": "integer",
"accuracy_score": "float",
"precision_score": "float",
"recall_score": "float",
"f1_score": "float",
"status": "TRAINING|ACTIVE|INACTIVE|RETRAINING|ERROR",
"version": "string",
"model_file_path": "string",
"last_trained_at": "datetime",
"training_duration_seconds": "integer",
"training_samples_count": "integer",
"auto_retrain_enabled": "boolean",
"retrain_frequency_days": "integer",
"performance_threshold": "float",
"created_by": "uuid",
"created_at": "datetime",
"updated_at": "datetime"
}
```
### Anomaly Detection
```json
{
"id": "uuid",
"model": "uuid",
"anomaly_type": "STATISTICAL|TEMPORAL|PATTERN|THRESHOLD|BEHAVIORAL",
"severity": "LOW|MEDIUM|HIGH|CRITICAL",
"status": "DETECTED|INVESTIGATING|CONFIRMED|FALSE_POSITIVE|RESOLVED",
"confidence_score": "float",
"anomaly_score": "float",
"threshold_used": "float",
"detected_at": "datetime",
"time_window_start": "datetime",
"time_window_end": "datetime",
"related_incidents": ["uuid"],
"affected_services": ["string"],
"affected_metrics": ["string"],
"description": "string",
"root_cause_analysis": "string",
"impact_assessment": "string",
"actions_taken": ["string"],
"resolved_at": "datetime",
"resolved_by": "uuid",
"metadata": "object"
}
```
### Cost Impact Analysis
```json
{
"id": "uuid",
"incident": "uuid",
"cost_type": "DOWNTIME|LOST_REVENUE|PENALTY|RESOURCE_COST|REPUTATION_COST|COMPLIANCE_COST",
"cost_amount": "decimal",
"currency": "string",
"calculation_method": "string",
"calculation_details": "object",
"downtime_hours": "decimal",
"affected_users": "integer",
"revenue_impact": "decimal",
"business_unit": "string",
"service_tier": "string",
"is_validated": "boolean",
"validated_by": "uuid",
"validated_at": "datetime",
"validation_notes": "string",
"created_at": "datetime",
"updated_at": "datetime"
}
```
## Management Commands
### Calculate KPIs
```bash
python manage.py calculate_kpis [--metric-id METRIC_ID] [--time-window HOURS] [--force]
```
### Run Anomaly Detection
```bash
python manage.py run_anomaly_detection [--model-id MODEL_ID] [--time-window HOURS]
```
### Train Predictive Models
```bash
python manage.py train_predictive_models [--model-id MODEL_ID] [--force]
```
## Error Handling
All endpoints return appropriate HTTP status codes and error messages:
- `400 Bad Request`: Invalid request data
- `401 Unauthorized`: Authentication required
- `403 Forbidden`: Insufficient permissions
- `404 Not Found`: Resource not found
- `500 Internal Server Error`: Server error
**Error Response Format:**
```json
{
"error": "Error message",
"details": "Additional error details",
"code": "ERROR_CODE"
}
```
## Authentication
All endpoints require authentication. Use one of the following methods:
1. **Token Authentication**: Include `Authorization: Token <token>` header
2. **Session Authentication**: Use Django session authentication
3. **SSO Authentication**: Use configured SSO providers
## Rate Limiting
API endpoints are rate-limited to prevent abuse:
- 1000 requests per hour per user
- 100 requests per minute per user
## Pagination
List endpoints support pagination:
- `page`: Page number (default: 1)
- `page_size`: Items per page (default: 20, max: 100)
## Filtering and Sorting
Most list endpoints support:
- **Filtering**: Use query parameters to filter results
- **Sorting**: Use `ordering` parameter (e.g., `ordering=-created_at`)
- **Search**: Use `search` parameter for text search
## Webhooks
The analytics module supports webhooks for real-time notifications:
- **Anomaly Detected**: Triggered when new anomalies are detected
- **KPI Threshold Breached**: Triggered when KPI values exceed thresholds
- **Model Training Completed**: Triggered when model training finishes
- **Cost Threshold Exceeded**: Triggered when cost impact exceeds thresholds
## Integration Examples
### Python Client Example
```python
import requests
# Get KPI summary
response = requests.get(
'https://api.example.com/api/analytics/kpi-metrics/summary/',
headers={'Authorization': 'Token your-token-here'}
)
kpi_summary = response.json()
# Create predictive model
model_data = {
'name': 'Incident Predictor',
'description': 'Predicts incident occurrence',
'model_type': 'INCIDENT_PREDICTION',
'algorithm_type': 'RANDOM_FOREST',
'model_config': {'n_estimators': 100}
}
response = requests.post(
'https://api.example.com/api/analytics/predictive-models/',
json=model_data,
headers={'Authorization': 'Token your-token-here'}
)
model = response.json()
```
### JavaScript Client Example
```javascript
// Get dashboard data
fetch('/api/analytics/dashboard/123/data/', {
headers: {
'Authorization': 'Token your-token-here',
'Content-Type': 'application/json'
}
})
.then(response => response.json())
.then(data => {
console.log('Dashboard data:', data);
// Update dashboard UI
});
```
## Best Practices
1. **Use appropriate time windows** for KPI calculations
2. **Monitor model performance** and retrain when accuracy drops
3. **Validate cost analyses** before using for business decisions
4. **Set up alerts** for critical anomalies and threshold breaches
5. **Regular cleanup** of expired insights and old measurements
6. **Use pagination** for large datasets
7. **Cache frequently accessed data** to improve performance
## Support
For technical support or questions about the Analytics & Predictive Insights API:
- **Documentation**: This API documentation
- **Issues**: Report issues through the project repository
- **Contact**: Contact the development team for assistance

View File

@@ -0,0 +1,338 @@
"""
Admin configuration for analytics_predictive_insights app
"""
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from django.utils.safestring import mark_safe
from .models import (
KPIMetric, KPIMeasurement, IncidentRecurrenceAnalysis, PredictiveModel,
AnomalyDetection, CostImpactAnalysis, DashboardConfiguration,
HeatmapData, PredictiveInsight
)
@admin.register(KPIMetric)
class KPIMetricAdmin(admin.ModelAdmin):
    """Admin interface for KPI metrics.

    Lets staff manage metric definitions: what is measured (``metric_type``),
    how it is aggregated, and which incidents are targeted.
    """
    # Columns shown on the changelist page.
    list_display = [
        'name', 'metric_type', 'aggregation_type', 'is_active',
        'is_system_metric', 'created_by', 'created_at'
    ]
    list_filter = [
        'metric_type', 'aggregation_type', 'is_active',
        'is_system_metric', 'created_at'
    ]
    search_fields = ['name', 'description']
    # Audit fields are system-managed and must not be editable.
    readonly_fields = ['id', 'created_at', 'updated_at']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'description', 'metric_type', 'aggregation_type')
        }),
        ('Targeting Criteria', {
            'fields': ('incident_categories', 'incident_severities', 'incident_priorities')
        }),
        ('Configuration', {
            'fields': ('calculation_formula', 'time_window_hours', 'is_active', 'is_system_metric')
        }),
        # Collapsed by default to keep the form compact.
        ('Metadata', {
            'fields': ('created_by', 'created_at', 'updated_at'),
            'classes': ('collapse',)
        })
    )
@admin.register(KPIMeasurement)
class KPIMeasurementAdmin(admin.ModelAdmin):
    """Admin interface for KPI measurements.

    Read-mostly view of calculated measurement snapshots; the calculation
    timestamp is system-managed and read-only.
    """
    list_display = [
        'metric', 'value', 'unit', 'incident_count',
        'measurement_period_start', 'calculated_at'
    ]
    # Filter by the parent metric's type via the FK traversal.
    list_filter = [
        'metric__metric_type', 'unit', 'calculated_at'
    ]
    search_fields = ['metric__name']
    readonly_fields = ['id', 'calculated_at']
    fieldsets = (
        ('Measurement Details', {
            'fields': ('id', 'metric', 'value', 'unit')
        }),
        ('Time Period', {
            'fields': ('measurement_period_start', 'measurement_period_end')
        }),
        ('Context', {
            'fields': ('incident_count', 'sample_size', 'metadata')
        }),
        ('Metadata', {
            'fields': ('calculated_at',),
            'classes': ('collapse',)
        })
    )
@admin.register(IncidentRecurrenceAnalysis)
class IncidentRecurrenceAnalysisAdmin(admin.ModelAdmin):
    """Admin interface for incident recurrence analysis.

    Groups the analysis form into pattern, impact, recommendation, and
    resolution sections for review of recurring-incident findings.
    """
    list_display = [
        'primary_incident', 'recurrence_type', 'confidence_score',
        'recurrence_rate', 'is_resolved', 'created_at'
    ]
    list_filter = [
        'recurrence_type', 'is_resolved', 'created_at'
    ]
    # Search by the title of the incident that anchors the analysis.
    search_fields = ['primary_incident__title']
    readonly_fields = ['id', 'created_at', 'updated_at']
    fieldsets = (
        ('Analysis Details', {
            'fields': ('id', 'primary_incident', 'recurring_incidents', 'recurrence_type', 'confidence_score', 'recurrence_rate')
        }),
        ('Pattern Characteristics', {
            'fields': ('common_keywords', 'common_categories', 'time_pattern')
        }),
        ('Impact Analysis', {
            'fields': ('total_affected_users', 'total_downtime_hours', 'estimated_cost_impact')
        }),
        ('Recommendations', {
            'fields': ('prevention_recommendations', 'automation_opportunities')
        }),
        ('Status', {
            'fields': ('is_resolved', 'resolution_actions')
        }),
        ('Metadata', {
            'fields': ('created_at', 'updated_at', 'model_version'),
            'classes': ('collapse',)
        })
    )
@admin.register(PredictiveModel)
class PredictiveModelAdmin(admin.ModelAdmin):
    """Admin interface for predictive models.

    Surfaces model configuration, training settings, performance metrics,
    and the auto-retraining policy in separate fieldsets.
    """
    list_display = [
        'name', 'model_type', 'algorithm_type', 'status',
        'accuracy_score', 'last_trained_at', 'created_at'
    ]
    list_filter = [
        'model_type', 'algorithm_type', 'status', 'created_at'
    ]
    search_fields = ['name', 'description']
    readonly_fields = ['id', 'created_at', 'updated_at']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'description', 'model_type', 'algorithm_type')
        }),
        ('Model Configuration', {
            'fields': ('model_config', 'feature_columns', 'target_column')
        }),
        ('Training Configuration', {
            'fields': ('training_data_period_days', 'min_training_samples')
        }),
        # Scores populated by the training pipeline.
        ('Performance Metrics', {
            'fields': ('accuracy_score', 'precision_score', 'recall_score', 'f1_score')
        }),
        ('Status and Metadata', {
            'fields': ('status', 'version', 'model_file_path', 'last_trained_at', 'training_duration_seconds', 'training_samples_count')
        }),
        ('Retraining Configuration', {
            'fields': ('auto_retrain_enabled', 'retrain_frequency_days', 'performance_threshold')
        }),
        ('Metadata', {
            'fields': ('created_by', 'created_at', 'updated_at'),
            'classes': ('collapse',)
        })
    )
@admin.register(AnomalyDetection)
class AnomalyDetectionAdmin(admin.ModelAdmin):
    """Admin interface for anomaly detection results.

    Presents each detection with its scores, time window, related data,
    analysis narrative, and resolution actions.
    """
    list_display = [
        'anomaly_type', 'severity', 'status', 'confidence_score',
        'detected_at', 'resolved_at'
    ]
    list_filter = [
        'anomaly_type', 'severity', 'status', 'detected_at'
    ]
    # Search the free-text description or the generating model's name.
    search_fields = ['description', 'model__name']
    readonly_fields = ['id', 'detected_at']
    fieldsets = (
        ('Detection Details', {
            'fields': ('id', 'model', 'anomaly_type', 'severity', 'status')
        }),
        ('Detection Metrics', {
            'fields': ('confidence_score', 'anomaly_score', 'threshold_used')
        }),
        ('Time Context', {
            'fields': ('detected_at', 'time_window_start', 'time_window_end')
        }),
        ('Related Data', {
            'fields': ('related_incidents', 'affected_services', 'affected_metrics')
        }),
        ('Analysis', {
            'fields': ('description', 'root_cause_analysis', 'impact_assessment')
        }),
        ('Actions', {
            'fields': ('actions_taken', 'resolved_at', 'resolved_by')
        }),
        ('Metadata', {
            'fields': ('metadata',),
            'classes': ('collapse',)
        })
    )
@admin.register(CostImpactAnalysis)
class CostImpactAnalysisAdmin(admin.ModelAdmin):
    """Admin interface for cost impact analysis.

    Shows per-incident cost records with their calculation method and a
    validation workflow section (validator, timestamp, notes).
    """
    list_display = [
        'incident', 'cost_type', 'cost_amount', 'currency',
        'is_validated', 'created_at'
    ]
    list_filter = [
        'cost_type', 'currency', 'is_validated', 'created_at'
    ]
    search_fields = ['incident__title', 'business_unit']
    readonly_fields = ['id', 'created_at', 'updated_at']
    fieldsets = (
        ('Cost Details', {
            'fields': ('id', 'incident', 'cost_type', 'cost_amount', 'currency')
        }),
        ('Calculation Details', {
            'fields': ('calculation_method', 'calculation_details')
        }),
        ('Impact Metrics', {
            'fields': ('downtime_hours', 'affected_users', 'revenue_impact')
        }),
        ('Business Context', {
            'fields': ('business_unit', 'service_tier')
        }),
        ('Validation', {
            'fields': ('is_validated', 'validated_by', 'validated_at', 'validation_notes')
        }),
        ('Metadata', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        })
    )
@admin.register(DashboardConfiguration)
class DashboardConfigurationAdmin(admin.ModelAdmin):
    """Admin interface for dashboard configurations.

    Manages dashboard layout/widget JSON, access control lists, and the
    auto-refresh policy.
    """
    list_display = [
        'name', 'dashboard_type', 'is_active', 'is_public',
        'auto_refresh_enabled', 'created_by', 'created_at'
    ]
    list_filter = [
        'dashboard_type', 'is_active', 'is_public', 'auto_refresh_enabled', 'created_at'
    ]
    search_fields = ['name', 'description']
    readonly_fields = ['id', 'created_at', 'updated_at']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'description', 'dashboard_type')
        }),
        ('Configuration', {
            'fields': ('layout_config', 'widget_configs')
        }),
        ('Access Control', {
            'fields': ('is_public', 'allowed_users', 'allowed_roles')
        }),
        ('Refresh Configuration', {
            'fields': ('auto_refresh_enabled', 'refresh_interval_seconds')
        }),
        ('Status', {
            'fields': ('is_active',)
        }),
        ('Metadata', {
            'fields': ('created_by', 'created_at', 'updated_at'),
            'classes': ('collapse',)
        })
    )
@admin.register(HeatmapData)
class HeatmapDataAdmin(admin.ModelAdmin):
    """Admin interface for heatmap data.

    Manages stored heatmap datasets: their time range/granularity and the
    data points plus rendering configuration.
    """
    list_display = [
        'name', 'heatmap_type', 'time_granularity',
        'aggregation_method', 'created_at'
    ]
    list_filter = [
        'heatmap_type', 'time_granularity', 'aggregation_method', 'created_at'
    ]
    search_fields = ['name']
    readonly_fields = ['id', 'created_at', 'updated_at']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'heatmap_type')
        }),
        ('Time Configuration', {
            'fields': ('time_period_start', 'time_period_end', 'time_granularity')
        }),
        ('Data Configuration', {
            'fields': ('data_points', 'color_scheme', 'aggregation_method')
        }),
        ('Metadata', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        })
    )
@admin.register(PredictiveInsight)
class PredictiveInsightAdmin(admin.ModelAdmin):
    """Admin interface for predictive insights.

    Covers the generated insight, its prediction details, supporting
    context, and the acknowledgment/validation lifecycle.
    """
    list_display = [
        'title', 'insight_type', 'confidence_level', 'confidence_score',
        'is_acknowledged', 'is_validated', 'generated_at'
    ]
    list_filter = [
        'insight_type', 'confidence_level', 'is_acknowledged',
        'is_validated', 'generated_at'
    ]
    search_fields = ['title', 'description', 'model__name']
    # Generation timestamp is set by the pipeline, never edited by hand.
    readonly_fields = ['id', 'generated_at']
    fieldsets = (
        ('Insight Details', {
            'fields': ('id', 'model', 'insight_type', 'title', 'description', 'confidence_level', 'confidence_score')
        }),
        ('Prediction Details', {
            'fields': ('predicted_value', 'prediction_horizon', 'prediction_date')
        }),
        ('Context', {
            'fields': ('input_features', 'supporting_evidence', 'related_incidents', 'affected_services')
        }),
        ('Recommendations', {
            'fields': ('recommendations', 'risk_assessment')
        }),
        ('Status', {
            'fields': ('is_acknowledged', 'acknowledged_by', 'acknowledged_at')
        }),
        ('Validation', {
            'fields': ('is_validated', 'actual_value', 'validation_accuracy')
        }),
        ('Metadata', {
            'fields': ('generated_at', 'expires_at'),
            'classes': ('collapse',)
        })
    )

View File

@@ -0,0 +1,20 @@
"""
Analytics & Predictive Insights app configuration
"""
from django.apps import AppConfig
class AnalyticsPredictiveInsightsConfig(AppConfig):
    """Django app config for the analytics & predictive insights module."""

    default_auto_field = 'django.db.models.BigAutoField'
    name = 'analytics_predictive_insights'
    verbose_name = 'Analytics & Predictive Insights'

    def ready(self):
        """Wire up signal handlers once the app registry is populated.

        The signals module is optional: if it does not exist the app still
        loads normally.
        """
        try:
            from . import signals  # noqa: F401  (import registers handlers)
        except ImportError:
            # No signals module shipped -- nothing to connect.
            pass

View File

@@ -0,0 +1 @@
# Management commands for analytics_predictive_insights

View File

@@ -0,0 +1 @@
# Management commands

View File

@@ -0,0 +1,216 @@
"""
Management command to calculate KPI measurements
"""
from django.core.management.base import BaseCommand, CommandError
from django.utils import timezone
from datetime import timedelta
from analytics_predictive_insights.models import KPIMetric, KPIMeasurement
from incident_intelligence.models import Incident
class Command(BaseCommand):
    """Calculate KPI measurements for all active metrics.

    For each active ``KPIMetric`` (or a single one selected via
    ``--metric-id``), derives a value from the incidents created inside the
    look-back window and persists it as a ``KPIMeasurement``.
    """
    help = 'Calculate KPI measurements for all active metrics'

    def add_arguments(self, parser):
        """Register CLI options: metric selection, window size, forced rerun."""
        parser.add_argument(
            '--metric-id',
            type=str,
            help='Calculate KPI for a specific metric ID only'
        )
        parser.add_argument(
            '--time-window',
            type=int,
            default=24,
            help='Time window in hours for KPI calculation (default: 24)'
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force recalculation even if recent measurement exists'
        )

    def handle(self, *args, **options):
        """Handle the command execution"""
        metric_id = options.get('metric_id')
        time_window = options.get('time_window', 24)
        force = options.get('force', False)
        try:
            # Resolve the metrics to process; an unknown --metric-id aborts.
            if metric_id:
                metrics = KPIMetric.objects.filter(id=metric_id, is_active=True)
                if not metrics.exists():
                    raise CommandError(f'No active metric found with ID: {metric_id}')
            else:
                metrics = KPIMetric.objects.filter(is_active=True)
            self.stdout.write(f'Calculating KPIs for {metrics.count()} metrics...')
            total_calculated = 0
            for metric in metrics:
                try:
                    calculated = self._calculate_metric(metric, time_window, force)
                    if calculated:
                        total_calculated += 1
                        self.stdout.write(
                            self.style.SUCCESS(f'✓ Calculated KPI for {metric.name}')
                        )
                    else:
                        self.stdout.write(
                            self.style.WARNING(f'⚠ Skipped KPI for {metric.name} (recent measurement exists)')
                        )
                except Exception as e:
                    # A failure on one metric must not stop the rest of the run.
                    self.stdout.write(
                        self.style.ERROR(f'✗ Error calculating KPI for {metric.name}: {str(e)}')
                    )
            self.stdout.write(
                self.style.SUCCESS(f'Successfully calculated {total_calculated} KPIs')
            )
        except Exception as e:
            raise CommandError(f'Error executing command: {str(e)}')

    def _calculate_metric(self, metric, time_window_hours, force=False):
        """Calculate KPI measurement for a specific metric.

        Returns True when a measurement was created or updated, False when a
        recent measurement made the run unnecessary.
        """
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)
        # Check if recent measurement exists (calculated within the last
        # hour); skip recalculation unless --force was given.
        if not force:
            recent_measurement = KPIMeasurement.objects.filter(
                metric=metric,
                calculated_at__gte=end_time - timedelta(hours=1)
            ).first()
            if recent_measurement:
                return False
        # Get incidents in the time window
        incidents = Incident.objects.filter(
            created_at__gte=start_time,
            created_at__lte=end_time
        )
        # Apply the metric's optional targeting filters.
        if metric.incident_categories:
            incidents = incidents.filter(category__in=metric.incident_categories)
        if metric.incident_severities:
            incidents = incidents.filter(severity__in=metric.incident_severities)
        if metric.incident_priorities:
            incidents = incidents.filter(priority__in=metric.incident_priorities)
        # Calculate metric value based on type; unknown types fall back to a
        # plain incident count.
        if metric.metric_type == 'MTTA':
            value, unit = self._calculate_mtta(incidents)
        elif metric.metric_type == 'MTTR':
            value, unit = self._calculate_mttr(incidents)
        elif metric.metric_type == 'INCIDENT_COUNT':
            value, unit = incidents.count(), 'count'
        elif metric.metric_type == 'RESOLUTION_RATE':
            value, unit = self._calculate_resolution_rate(incidents)
        elif metric.metric_type == 'AVAILABILITY':
            value, unit = self._calculate_availability(incidents)
        else:
            value, unit = incidents.count(), 'count'
        # Create or update the measurement row keyed on this exact period.
        measurement, created = KPIMeasurement.objects.get_or_create(
            metric=metric,
            measurement_period_start=start_time,
            measurement_period_end=end_time,
            defaults={
                'value': value,
                'unit': unit,
                'incident_count': incidents.count(),
                'sample_size': incidents.count()
            }
        )
        if not created:
            measurement.value = value
            measurement.unit = unit
            measurement.incident_count = incidents.count()
            measurement.sample_size = incidents.count()
            measurement.save()
        return True

    def _calculate_mtta(self, incidents):
        """Calculate Mean Time to Acknowledge, in minutes."""
        # "Acknowledged" is approximated as assigned and past NEW status.
        acknowledged_incidents = incidents.filter(
            status__in=['IN_PROGRESS', 'RESOLVED', 'CLOSED']
        ).exclude(assigned_to__isnull=True)
        if not acknowledged_incidents.exists():
            return 0, 'minutes'
        total_time = timedelta()
        count = 0
        for incident in acknowledged_incidents:
            # Simplified calculation - in practice, you'd track acknowledgment time
            if incident.updated_at and incident.created_at:
                time_diff = incident.updated_at - incident.created_at
                total_time += time_diff
                count += 1
        if count > 0:
            avg_time = total_time / count
            return avg_time.total_seconds() / 60, 'minutes'  # Convert to minutes
        return 0, 'minutes'

    def _calculate_mttr(self, incidents):
        """Calculate Mean Time to Resolve, in hours."""
        resolved_incidents = incidents.filter(
            status__in=['RESOLVED', 'CLOSED'],
            resolved_at__isnull=False
        )
        if not resolved_incidents.exists():
            return 0, 'hours'
        total_time = timedelta()
        count = 0
        for incident in resolved_incidents:
            if incident.resolved_at and incident.created_at:
                time_diff = incident.resolved_at - incident.created_at
                total_time += time_diff
                count += 1
        if count > 0:
            avg_time = total_time / count
            return avg_time.total_seconds() / 3600, 'hours'  # Convert to hours
        return 0, 'hours'

    def _calculate_resolution_rate(self, incidents):
        """Calculate resolution rate as a percentage of incidents resolved."""
        total_incidents = incidents.count()
        if total_incidents == 0:
            return 0, 'percentage'
        resolved_incidents = incidents.filter(
            status__in=['RESOLVED', 'CLOSED']
        ).count()
        rate = (resolved_incidents / total_incidents) * 100
        return rate, 'percentage'

    def _calculate_availability(self, incidents):
        """Calculate service availability as a percentage."""
        # Simplified availability calculation
        # In practice, you'd need more sophisticated uptime tracking
        total_incidents = incidents.count()
        if total_incidents == 0:
            return 100, 'percentage'
        # Assume availability decreases with incident count
        # This is a simplified calculation: 0.1% penalty per incident,
        # floored at zero.
        availability = max(0, 100 - (total_incidents * 0.1))
        return availability, 'percentage'

View File

@@ -0,0 +1,63 @@
"""
Management command to run anomaly detection
"""
from django.core.management.base import BaseCommand, CommandError
from analytics_predictive_insights.models import PredictiveModel
from analytics_predictive_insights.ml.anomaly_detection import AnomalyDetectionService
class Command(BaseCommand):
    """Run anomaly detection using every active anomaly-detection model."""

    help = 'Run anomaly detection using all active anomaly detection models'

    def add_arguments(self, parser):
        """Register CLI options for model selection and summary window."""
        parser.add_argument(
            '--model-id',
            type=str,
            help='Run anomaly detection for a specific model ID only',
        )
        parser.add_argument(
            '--time-window',
            type=int,
            default=24,
            help='Time window in hours for anomaly detection (default: 24)',
        )

    def handle(self, *args, **options):
        """Execute detection and print a per-severity summary."""
        model_id = options.get('model_id')
        window_hours = options.get('time_window', 24)
        try:
            service = AnomalyDetectionService()
            self.stdout.write('Starting anomaly detection...')
            detected = service.run_anomaly_detection(model_id)
            if detected > 0:
                outcome = self.style.SUCCESS(f'✓ Detected {detected} anomalies')
            else:
                outcome = self.style.WARNING('⚠ No anomalies detected')
            self.stdout.write(outcome)
            self._report_summary(service.get_anomaly_summary(window_hours))
        except Exception as e:
            raise CommandError(f'Error running anomaly detection: {str(e)}')

    def _report_summary(self, summary):
        """Write the anomaly counts and false-positive rate to stdout."""
        self.stdout.write('\nAnomaly Summary:')
        self.stdout.write(f' Total anomalies: {summary["total_anomalies"]}')
        self.stdout.write(f' Critical: {summary["critical_anomalies"]}')
        self.stdout.write(f' High: {summary["high_anomalies"]}')
        self.stdout.write(f' Medium: {summary["medium_anomalies"]}')
        self.stdout.write(f' Low: {summary["low_anomalies"]}')
        self.stdout.write(f' Unresolved: {summary["unresolved_anomalies"]}')
        self.stdout.write(f' False positive rate: {summary["false_positive_rate"]:.2f}%')

View File

@@ -0,0 +1,108 @@
"""
Management command to train predictive models
"""
from django.core.management.base import BaseCommand, CommandError
from analytics_predictive_insights.models import PredictiveModel
from analytics_predictive_insights.ml.predictive_models import PredictiveModelService
class Command(BaseCommand):
    """Management command that trains predictive models.

    Picks the target models from the CLI options, trains each one through
    ``PredictiveModelService`` and prints a final success/failure summary.
    """

    help = 'Train predictive models that are in training status'

    def add_arguments(self, parser):
        """Register the optional CLI flags for this command."""
        parser.add_argument(
            '--model-id',
            type=str,
            help='Train a specific model ID only'
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force retraining of active models'
        )

    def handle(self, *args, **options):
        """Select the target models, train each one and print a summary."""
        model_id = options.get('model_id')
        force = options.get('force', False)
        try:
            model_service = PredictiveModelService()
            models = self._select_models(model_id, force)
            self.stdout.write(f'Training {models.count()} models...')

            trained = 0
            failed = 0
            for model in models:
                if self._train_one(model_service, model):
                    trained += 1
                else:
                    failed += 1

            self.stdout.write('\nTraining Summary:')
            self.stdout.write(f' Successfully trained: {trained}')
            self.stdout.write(f' Failed: {failed}')
            if trained > 0:
                self.stdout.write(self.style.SUCCESS('✓ Training completed successfully'))
            else:
                self.stdout.write(self.style.WARNING('⚠ No models were successfully trained'))
        except Exception as e:
            raise CommandError(f'Error executing command: {str(e)}')

    def _select_models(self, model_id, force):
        """Return the queryset of models to train for the given options.

        Raises CommandError (re-wrapped by handle()) when an explicit
        ``model_id`` does not exist.
        """
        if model_id:
            models = PredictiveModel.objects.filter(id=model_id)
            if not models.exists():
                raise CommandError(f'No model found with ID: {model_id}')
            return models
        if force:
            # Force-mode retrains the prediction models regardless of status.
            return PredictiveModel.objects.filter(
                model_type__in=[
                    'INCIDENT_PREDICTION',
                    'SEVERITY_PREDICTION',
                    'RESOLUTION_TIME_PREDICTION',
                    'COST_PREDICTION'
                ]
            )
        return PredictiveModel.objects.filter(status='TRAINING')

    def _train_one(self, model_service, model):
        """Train one model, report its progress, and return True on success."""
        try:
            self.stdout.write(f'Training model: {model.name}...')
            result = model_service.train_model(str(model.id))
            if not result['success']:
                self.stdout.write(
                    self.style.ERROR(f'✗ Failed to train {model.name}: {result.get("error", "Unknown error")}')
                )
                return False
            self.stdout.write(
                self.style.SUCCESS(f'✓ Successfully trained {model.name}')
            )
            if 'metrics' in result:
                metrics = result['metrics']
                self.stdout.write(f' Accuracy: {metrics.get("accuracy", "N/A")}')
                self.stdout.write(f' Precision: {metrics.get("precision", "N/A")}')
                self.stdout.write(f' Recall: {metrics.get("recall", "N/A")}')
                self.stdout.write(f' F1 Score: {metrics.get("f1_score", "N/A")}')
                self.stdout.write(f' R2 Score: {metrics.get("r2_score", "N/A")}')
            self.stdout.write(f' Training samples: {result.get("training_samples", "N/A")}')
            self.stdout.write(f' Training duration: {result.get("training_duration", "N/A")} seconds')
            return True
        except Exception as e:
            self.stdout.write(
                self.style.ERROR(f'✗ Error training {model.name}: {str(e)}')
            )
            return False

View File

@@ -0,0 +1,311 @@
# Generated by Django 5.2.6 on 2025-09-18 17:16
import django.core.validators
import django.db.models.deletion
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial schema for the analytics & predictive-insights app.

    Auto-generated by Django's makemigrations (see the header comment at the
    top of this file); do not edit by hand — create a follow-up migration
    for any schema change.
    """

    initial = True

    dependencies = [
        ('incident_intelligence', '0004_incident_oncall_assignment_incident_sla_override_and_more'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # HeatmapData: pre-computed heatmap data points per type/granularity.
        migrations.CreateModel(
            name='HeatmapData',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('heatmap_type', models.CharField(choices=[('INCIDENT_FREQUENCY', 'Incident Frequency'), ('RESOLUTION_TIME', 'Resolution Time'), ('COST_IMPACT', 'Cost Impact'), ('ANOMALY_DENSITY', 'Anomaly Density'), ('SLA_PERFORMANCE', 'SLA Performance')], max_length=20)),
                ('time_period_start', models.DateTimeField()),
                ('time_period_end', models.DateTimeField()),
                ('time_granularity', models.CharField(choices=[('HOUR', 'Hour'), ('DAY', 'Day'), ('WEEK', 'Week'), ('MONTH', 'Month')], max_length=20)),
                ('data_points', models.JSONField(help_text='Heatmap data points with coordinates and values')),
                ('color_scheme', models.CharField(default='viridis', help_text='Color scheme for the heatmap', max_length=50)),
                ('aggregation_method', models.CharField(choices=[('SUM', 'Sum'), ('AVERAGE', 'Average'), ('COUNT', 'Count'), ('MAX', 'Maximum'), ('MIN', 'Minimum')], max_length=20)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
            options={
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['heatmap_type', 'time_period_start'], name='analytics_p_heatmap_61786e_idx'), models.Index(fields=['time_granularity'], name='analytics_p_time_gr_6c8e73_idx')],
            },
        ),
        # KPIMetric: definitions of KPI metrics (MTTA, MTTR, availability, ...).
        migrations.CreateModel(
            name='KPIMetric',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('metric_type', models.CharField(choices=[('MTTA', 'Mean Time to Acknowledge'), ('MTTR', 'Mean Time to Resolve'), ('MTBF', 'Mean Time Between Failures'), ('MTBSI', 'Mean Time Between Service Incidents'), ('AVAILABILITY', 'Service Availability'), ('INCIDENT_COUNT', 'Incident Count'), ('RESOLUTION_RATE', 'Resolution Rate'), ('ESCALATION_RATE', 'Escalation Rate'), ('CUSTOM', 'Custom Metric')], max_length=20)),
                ('aggregation_type', models.CharField(choices=[('AVERAGE', 'Average'), ('MEDIAN', 'Median'), ('MIN', 'Minimum'), ('MAX', 'Maximum'), ('SUM', 'Sum'), ('COUNT', 'Count'), ('PERCENTILE_95', '95th Percentile'), ('PERCENTILE_99', '99th Percentile')], max_length=20)),
                ('incident_categories', models.JSONField(default=list, help_text='List of incident categories this metric applies to')),
                ('incident_severities', models.JSONField(default=list, help_text='List of incident severities this metric applies to')),
                ('incident_priorities', models.JSONField(default=list, help_text='List of incident priorities this metric applies to')),
                ('calculation_formula', models.TextField(blank=True, help_text='Custom calculation formula for complex metrics', null=True)),
                ('time_window_hours', models.PositiveIntegerField(default=24, help_text='Time window for metric calculation in hours')),
                ('is_active', models.BooleanField(default=True)),
                ('is_system_metric', models.BooleanField(default=False, help_text='Whether this is a system-defined metric')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        # KPIMeasurement: one computed value of a KPIMetric over a period.
        migrations.CreateModel(
            name='KPIMeasurement',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('value', models.DecimalField(decimal_places=4, max_digits=15)),
                ('unit', models.CharField(help_text='Unit of measurement (minutes, hours, percentage, etc.)', max_length=50)),
                ('measurement_period_start', models.DateTimeField()),
                ('measurement_period_end', models.DateTimeField()),
                ('incident_count', models.PositiveIntegerField(default=0, help_text='Number of incidents included in this measurement')),
                ('sample_size', models.PositiveIntegerField(default=0, help_text='Total sample size for this measurement')),
                ('metadata', models.JSONField(default=dict, help_text='Additional metadata for this measurement')),
                ('calculated_at', models.DateTimeField(auto_now_add=True)),
                ('metric', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='measurements', to='analytics_predictive_insights.kpimetric')),
            ],
            options={
                'ordering': ['-calculated_at'],
            },
        ),
        # PredictiveModel: ML model registry — config, scores, lifecycle status.
        migrations.CreateModel(
            name='PredictiveModel',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('model_type', models.CharField(choices=[('ANOMALY_DETECTION', 'Anomaly Detection'), ('INCIDENT_PREDICTION', 'Incident Prediction'), ('SEVERITY_PREDICTION', 'Severity Prediction'), ('RESOLUTION_TIME_PREDICTION', 'Resolution Time Prediction'), ('ESCALATION_PREDICTION', 'Escalation Prediction'), ('COST_PREDICTION', 'Cost Impact Prediction')], max_length=30)),
                ('algorithm_type', models.CharField(choices=[('ISOLATION_FOREST', 'Isolation Forest'), ('LSTM', 'Long Short-Term Memory'), ('RANDOM_FOREST', 'Random Forest'), ('XGBOOST', 'XGBoost'), ('SVM', 'Support Vector Machine'), ('NEURAL_NETWORK', 'Neural Network'), ('ARIMA', 'ARIMA'), ('PROPHET', 'Prophet')], max_length=20)),
                ('model_config', models.JSONField(default=dict, help_text='Model-specific configuration parameters')),
                ('feature_columns', models.JSONField(default=list, help_text='List of feature columns used by the model')),
                ('target_column', models.CharField(help_text='Target column for prediction', max_length=100)),
                ('training_data_period_days', models.PositiveIntegerField(default=90, help_text='Number of days of training data to use')),
                ('min_training_samples', models.PositiveIntegerField(default=100, help_text='Minimum number of samples required for training')),
                ('accuracy_score', models.FloatField(blank=True, null=True, validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('precision_score', models.FloatField(blank=True, null=True, validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('recall_score', models.FloatField(blank=True, null=True, validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('f1_score', models.FloatField(blank=True, null=True, validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('status', models.CharField(choices=[('TRAINING', 'Training'), ('ACTIVE', 'Active'), ('INACTIVE', 'Inactive'), ('RETRAINING', 'Retraining'), ('ERROR', 'Error')], default='TRAINING', max_length=20)),
                ('version', models.CharField(default='1.0', max_length=20)),
                ('model_file_path', models.CharField(blank=True, help_text='Path to the trained model file', max_length=500, null=True)),
                ('last_trained_at', models.DateTimeField(blank=True, null=True)),
                ('training_duration_seconds', models.PositiveIntegerField(blank=True, null=True)),
                ('training_samples_count', models.PositiveIntegerField(blank=True, null=True)),
                ('auto_retrain_enabled', models.BooleanField(default=True)),
                ('retrain_frequency_days', models.PositiveIntegerField(default=7)),
                ('performance_threshold', models.FloatField(default=0.8, help_text='Performance threshold below which model should be retrained', validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['-created_at'],
            },
        ),
        # PredictiveInsight: predictions generated by a PredictiveModel.
        migrations.CreateModel(
            name='PredictiveInsight',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('insight_type', models.CharField(choices=[('INCIDENT_PREDICTION', 'Incident Prediction'), ('SEVERITY_PREDICTION', 'Severity Prediction'), ('RESOLUTION_TIME_PREDICTION', 'Resolution Time Prediction'), ('COST_PREDICTION', 'Cost Prediction'), ('TREND_ANALYSIS', 'Trend Analysis'), ('PATTERN_DETECTION', 'Pattern Detection')], max_length=30)),
                ('title', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('confidence_level', models.CharField(choices=[('LOW', 'Low Confidence'), ('MEDIUM', 'Medium Confidence'), ('HIGH', 'High Confidence'), ('VERY_HIGH', 'Very High Confidence')], max_length=20)),
                ('confidence_score', models.FloatField(validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('predicted_value', models.JSONField(help_text='Predicted value or values')),
                ('prediction_horizon', models.PositiveIntegerField(help_text='Prediction horizon in hours')),
                ('prediction_date', models.DateTimeField(help_text='When the prediction is for')),
                ('input_features', models.JSONField(help_text='Input features used for the prediction')),
                ('supporting_evidence', models.JSONField(default=list, help_text='Supporting evidence for the prediction')),
                ('affected_services', models.JSONField(default=list, help_text='Services that may be affected')),
                ('recommendations', models.JSONField(default=list, help_text='AI-generated recommendations based on the insight')),
                ('risk_assessment', models.TextField(blank=True, help_text='Risk assessment based on the prediction', null=True)),
                ('is_acknowledged', models.BooleanField(default=False)),
                ('acknowledged_at', models.DateTimeField(blank=True, null=True)),
                ('is_validated', models.BooleanField(default=False)),
                ('actual_value', models.JSONField(blank=True, help_text='Actual value when prediction is validated', null=True)),
                ('validation_accuracy', models.FloatField(blank=True, null=True, validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('generated_at', models.DateTimeField(auto_now_add=True)),
                ('expires_at', models.DateTimeField(help_text='When this insight expires')),
                ('acknowledged_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='acknowledged_insights', to=settings.AUTH_USER_MODEL)),
                ('related_incidents', models.ManyToManyField(blank=True, related_name='predictive_insights', to='incident_intelligence.incident')),
                ('model', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='insights', to='analytics_predictive_insights.predictivemodel')),
            ],
            options={
                'ordering': ['-generated_at'],
            },
        ),
        # AnomalyDetection: detected anomalies and their triage state.
        migrations.CreateModel(
            name='AnomalyDetection',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('anomaly_type', models.CharField(choices=[('STATISTICAL', 'Statistical Anomaly'), ('TEMPORAL', 'Temporal Anomaly'), ('PATTERN', 'Pattern Anomaly'), ('THRESHOLD', 'Threshold Breach'), ('BEHAVIORAL', 'Behavioral Anomaly')], max_length=20)),
                ('severity', models.CharField(choices=[('LOW', 'Low'), ('MEDIUM', 'Medium'), ('HIGH', 'High'), ('CRITICAL', 'Critical')], max_length=20)),
                ('status', models.CharField(choices=[('DETECTED', 'Detected'), ('INVESTIGATING', 'Investigating'), ('CONFIRMED', 'Confirmed'), ('FALSE_POSITIVE', 'False Positive'), ('RESOLVED', 'Resolved')], default='DETECTED', max_length=20)),
                ('confidence_score', models.FloatField(validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('anomaly_score', models.FloatField(help_text='Raw anomaly score from the model')),
                ('threshold_used', models.FloatField(help_text='Threshold used for anomaly detection')),
                ('detected_at', models.DateTimeField(auto_now_add=True)),
                ('time_window_start', models.DateTimeField()),
                ('time_window_end', models.DateTimeField()),
                ('affected_services', models.JSONField(default=list, help_text='Services affected by this anomaly')),
                ('affected_metrics', models.JSONField(default=list, help_text='Metrics that showed anomalous behavior')),
                ('description', models.TextField(help_text='Description of the anomaly')),
                ('root_cause_analysis', models.TextField(blank=True, help_text='Root cause analysis of the anomaly', null=True)),
                ('impact_assessment', models.TextField(blank=True, help_text="Assessment of the anomaly's impact", null=True)),
                ('actions_taken', models.JSONField(default=list, help_text='Actions taken in response to the anomaly')),
                ('resolved_at', models.DateTimeField(blank=True, null=True)),
                ('metadata', models.JSONField(default=dict, help_text='Additional metadata for this anomaly')),
                ('related_incidents', models.ManyToManyField(blank=True, related_name='anomaly_detections', to='incident_intelligence.incident')),
                ('resolved_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='resolved_anomalies', to=settings.AUTH_USER_MODEL)),
                ('model', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='anomaly_detections', to='analytics_predictive_insights.predictivemodel')),
            ],
            options={
                'ordering': ['-detected_at'],
            },
        ),
        # CostImpactAnalysis: per-incident cost impact records.
        migrations.CreateModel(
            name='CostImpactAnalysis',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('cost_type', models.CharField(choices=[('DOWNTIME', 'Downtime Cost'), ('LOST_REVENUE', 'Lost Revenue'), ('PENALTY', 'Penalty Cost'), ('RESOURCE_COST', 'Resource Cost'), ('REPUTATION_COST', 'Reputation Cost'), ('COMPLIANCE_COST', 'Compliance Cost')], max_length=20)),
                ('cost_amount', models.DecimalField(decimal_places=2, help_text='Cost amount in USD', max_digits=15)),
                ('currency', models.CharField(default='USD', max_length=3)),
                ('calculation_method', models.CharField(help_text='Method used to calculate the cost', max_length=50)),
                ('calculation_details', models.JSONField(default=dict, help_text='Detailed breakdown of cost calculation')),
                ('downtime_hours', models.DecimalField(blank=True, decimal_places=2, help_text='Total downtime in hours', max_digits=10, null=True)),
                ('affected_users', models.PositiveIntegerField(blank=True, help_text='Number of users affected', null=True)),
                ('revenue_impact', models.DecimalField(blank=True, decimal_places=2, help_text='Revenue impact in USD', max_digits=15, null=True)),
                ('business_unit', models.CharField(blank=True, help_text='Business unit affected', max_length=100, null=True)),
                ('service_tier', models.CharField(blank=True, help_text='Service tier (e.g., Premium, Standard)', max_length=50, null=True)),
                ('is_validated', models.BooleanField(default=False)),
                ('validated_at', models.DateTimeField(blank=True, null=True)),
                ('validation_notes', models.TextField(blank=True, null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='cost_analyses', to='incident_intelligence.incident')),
                ('validated_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='validated_cost_analyses', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['incident', 'cost_type'], name='analytics_p_inciden_c66cda_idx'), models.Index(fields=['cost_amount'], name='analytics_p_cost_am_92cb70_idx'), models.Index(fields=['is_validated'], name='analytics_p_is_vali_bf5116_idx')],
            },
        ),
        # DashboardConfiguration: dashboard layout, widgets and access control.
        migrations.CreateModel(
            name='DashboardConfiguration',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('dashboard_type', models.CharField(choices=[('EXECUTIVE', 'Executive Dashboard'), ('OPERATIONAL', 'Operational Dashboard'), ('TECHNICAL', 'Technical Dashboard'), ('CUSTOM', 'Custom Dashboard')], max_length=20)),
                ('layout_config', models.JSONField(default=dict, help_text='Dashboard layout configuration')),
                ('widget_configs', models.JSONField(default=list, help_text='Configuration for dashboard widgets')),
                ('is_public', models.BooleanField(default=False)),
                ('allowed_roles', models.JSONField(default=list, help_text='List of roles that can access this dashboard')),
                ('auto_refresh_enabled', models.BooleanField(default=True)),
                ('refresh_interval_seconds', models.PositiveIntegerField(default=300)),
                ('is_active', models.BooleanField(default=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('allowed_users', models.ManyToManyField(blank=True, related_name='accessible_dashboards', to=settings.AUTH_USER_MODEL)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
                'indexes': [models.Index(fields=['dashboard_type', 'is_active'], name='analytics_p_dashboa_a8155f_idx'), models.Index(fields=['is_public'], name='analytics_p_is_publ_c4c7bd_idx')],
            },
        ),
        # IncidentRecurrenceAnalysis: recurring-incident pattern analysis results.
        migrations.CreateModel(
            name='IncidentRecurrenceAnalysis',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('recurrence_type', models.CharField(choices=[('EXACT_DUPLICATE', 'Exact Duplicate'), ('SIMILAR_PATTERN', 'Similar Pattern'), ('SEASONAL', 'Seasonal Recurrence'), ('TREND', 'Trend-based Recurrence'), ('CASCADE', 'Cascade Effect')], max_length=20)),
                ('confidence_score', models.FloatField(validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('recurrence_rate', models.FloatField(help_text='Rate of recurrence (incidents per time period)')),
                ('common_keywords', models.JSONField(default=list, help_text='Common keywords across recurring incidents')),
                ('common_categories', models.JSONField(default=list, help_text='Common categories across recurring incidents')),
                ('time_pattern', models.JSONField(default=dict, help_text='Time-based pattern analysis')),
                ('total_affected_users', models.PositiveIntegerField(default=0)),
                ('total_downtime_hours', models.DecimalField(decimal_places=2, default=0, max_digits=10)),
                ('estimated_cost_impact', models.DecimalField(decimal_places=2, default=0, max_digits=15)),
                ('prevention_recommendations', models.JSONField(default=list, help_text='AI-generated recommendations to prevent recurrence')),
                ('automation_opportunities', models.JSONField(default=list, help_text='Potential automation opportunities identified')),
                ('is_resolved', models.BooleanField(default=False)),
                ('resolution_actions', models.JSONField(default=list, help_text='Actions taken to resolve the recurrence pattern')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('model_version', models.CharField(default='v1.0', max_length=50)),
                ('primary_incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='recurrence_analyses_as_primary', to='incident_intelligence.incident')),
                ('recurring_incidents', models.ManyToManyField(related_name='recurrence_analyses_as_recurring', to='incident_intelligence.incident')),
            ],
            options={
                'ordering': ['-confidence_score', '-created_at'],
                'indexes': [models.Index(fields=['recurrence_type', 'confidence_score'], name='analytics_p_recurre_420fe9_idx'), models.Index(fields=['is_resolved'], name='analytics_p_is_reso_cdecdd_idx')],
            },
        ),
        # Additional secondary indexes.
        migrations.AddIndex(
            model_name='kpimetric',
            index=models.Index(fields=['metric_type', 'is_active'], name='analytics_p_metric__8e1291_idx'),
        ),
        migrations.AddIndex(
            model_name='kpimetric',
            index=models.Index(fields=['incident_categories'], name='analytics_p_inciden_fcc290_idx'),
        ),
        migrations.AddIndex(
            model_name='kpimetric',
            index=models.Index(fields=['incident_severities'], name='analytics_p_inciden_601d71_idx'),
        ),
        migrations.AddIndex(
            model_name='kpimeasurement',
            index=models.Index(fields=['metric', 'measurement_period_start'], name='analytics_p_metric__5c1184_idx'),
        ),
        migrations.AddIndex(
            model_name='kpimeasurement',
            index=models.Index(fields=['calculated_at'], name='analytics_p_calcula_e8b072_idx'),
        ),
        migrations.AddIndex(
            model_name='predictivemodel',
            index=models.Index(fields=['model_type', 'status'], name='analytics_p_model_t_b1e3f4_idx'),
        ),
        migrations.AddIndex(
            model_name='predictivemodel',
            index=models.Index(fields=['algorithm_type'], name='analytics_p_algorit_1f51a1_idx'),
        ),
        migrations.AddIndex(
            model_name='predictivemodel',
            index=models.Index(fields=['status'], name='analytics_p_status_ad4300_idx'),
        ),
        migrations.AddIndex(
            model_name='predictiveinsight',
            index=models.Index(fields=['insight_type', 'confidence_score'], name='analytics_p_insight_ac65ec_idx'),
        ),
        migrations.AddIndex(
            model_name='predictiveinsight',
            index=models.Index(fields=['prediction_date'], name='analytics_p_predict_d606fb_idx'),
        ),
        migrations.AddIndex(
            model_name='predictiveinsight',
            index=models.Index(fields=['is_acknowledged'], name='analytics_p_is_ackn_16014e_idx'),
        ),
        migrations.AddIndex(
            model_name='anomalydetection',
            index=models.Index(fields=['anomaly_type', 'severity'], name='analytics_p_anomaly_d51ee4_idx'),
        ),
        migrations.AddIndex(
            model_name='anomalydetection',
            index=models.Index(fields=['status', 'detected_at'], name='analytics_p_status_c15b14_idx'),
        ),
        migrations.AddIndex(
            model_name='anomalydetection',
            index=models.Index(fields=['confidence_score'], name='analytics_p_confide_c99920_idx'),
        ),
    ]

View File

@@ -0,0 +1,32 @@
# Generated by Django 5.2.6 on 2025-09-18 17:19
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Follow-up migration linking analytics models to other apps.

    Auto-generated by Django's makemigrations; do not edit by hand. Adds
    cross-app foreign keys/M2Ms to the SLA, automation and security apps.
    """

    dependencies = [
        ('analytics_predictive_insights', '0001_initial'),
        ('automation_orchestration', '0002_autoremediationexecution_sla_instance_and_more'),
        ('security', '0002_user_emergency_contact_user_oncall_preferences_and_more'),
        ('sla_oncall', '0001_initial'),
    ]

    operations = [
        # Optional link from a cost analysis to the SLA instance it came from.
        migrations.AddField(
            model_name='costimpactanalysis',
            name='sla_instance',
            field=models.ForeignKey(blank=True, help_text='Related SLA instance for cost calculation', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='cost_analyses', to='sla_oncall.slainstance'),
        ),
        # Runbooks suggested to prevent an identified recurrence pattern.
        migrations.AddField(
            model_name='incidentrecurrenceanalysis',
            name='suggested_runbooks',
            field=models.ManyToManyField(blank=True, help_text='Runbooks suggested to prevent recurrence', related_name='recurrence_analyses', to='automation_orchestration.runbook'),
        ),
        # Optional data-classification tag on generated insights.
        migrations.AddField(
            model_name='predictiveinsight',
            name='data_classification',
            field=models.ForeignKey(blank=True, help_text='Data classification level for this insight', null=True, on_delete=django.db.models.deletion.SET_NULL, to='security.dataclassification'),
        ),
    ]

View File

@@ -0,0 +1 @@
# ML components for analytics and predictive insights

View File

@@ -0,0 +1,491 @@
"""
ML-based anomaly detection for incident management
Implements various anomaly detection algorithms for identifying unusual patterns
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Any
from datetime import datetime, timedelta
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from scipy import stats
import logging
from django.utils import timezone
from django.db.models import Q, Avg, Count, Sum
from incident_intelligence.models import Incident
from ..models import AnomalyDetection, PredictiveModel
logger = logging.getLogger(__name__)
class AnomalyDetector:
    """Base class for anomaly detection algorithms.

    Subclasses implement :meth:`fit`, :meth:`predict` and
    :meth:`get_anomaly_scores`; this base only stores the configuration,
    a shared ``StandardScaler`` instance, and the fitted flag.
    """

    def __init__(self, model_config: Optional[Dict[str, Any]] = None) -> None:
        """Store the configuration and initialise shared state.

        Args:
            model_config: Algorithm-specific parameters. ``None`` (or any
                falsy mapping) is treated as an empty configuration.
        """
        # NOTE: `model_config or {}` replaces an explicitly-passed empty
        # dict with a fresh one; callers must not rely on object identity.
        self.model_config = model_config or {}
        self.scaler = StandardScaler()
        self.is_fitted = False

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the anomaly detection model on historical data."""
        raise NotImplementedError

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Return a boolean array flagging anomalous rows of ``data``."""
        raise NotImplementedError

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Return per-row anomaly scores (higher = more anomalous)."""
        raise NotImplementedError
class StatisticalAnomalyDetector(AnomalyDetector):
    """Statistical anomaly detection using z-score and IQR methods.

    ``fit`` caches per-column mean/std and quartile statistics; ``predict``
    flags a row when any numeric column exceeds the z-score threshold or
    falls outside the Tukey (IQR) fences.

    NOTE(review): index labels of the incoming frames are used as positions
    into the result arrays — this assumes a default RangeIndex; confirm
    that callers never pass frames with a custom index.
    """

    def __init__(self, model_config: Optional[Dict[str, Any]] = None) -> None:
        """Read the z-score threshold and IQR multiplier from the config."""
        super().__init__(model_config)
        self.z_threshold = self.model_config.get('z_threshold', 3.0)
        self.iqr_multiplier = self.model_config.get('iqr_multiplier', 1.5)
        # Per-column stats populated by fit().
        self.stats_cache = {}

    def fit(self, data: pd.DataFrame) -> None:
        """Cache mean/std and quartile statistics for each numeric column."""
        for column in data.columns:
            if data[column].dtype in ['int64', 'float64']:
                values = data[column].dropna()
                if len(values) > 0:
                    # Compute each quantile once (the original recomputed them).
                    q1 = values.quantile(0.25)
                    q3 = values.quantile(0.75)
                    # Named `col_stats` elsewhere to avoid shadowing the
                    # module-level `from scipy import stats` import.
                    self.stats_cache[column] = {
                        'mean': values.mean(),
                        'std': values.std(),
                        'q1': q1,
                        'q3': q3,
                        'iqr': q3 - q1,
                    }
        self.is_fitted = True

    def _z_anomaly_scores(self, values: pd.Series, col_stats: Dict[str, Any]) -> Optional[pd.Series]:
        """Return |z|-scores for ``values``, or None when std is unusable.

        A zero or NaN std (constant or single-sample training column) would
        make the z-test divide by zero; skipping it preserves the original
        outcome (no z flags) without runtime warnings or NaN propagation.
        """
        std = col_stats['std']
        if not std or np.isnan(std):
            return None
        return np.abs((values - col_stats['mean']) / std)

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Return a boolean mask flagging statistically anomalous rows.

        Raises:
            ValueError: if called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        anomaly_flags = np.zeros(len(data), dtype=bool)
        for column in data.columns:
            if column in self.stats_cache and data[column].dtype in ['int64', 'float64']:
                values = data[column].dropna()
                if len(values) == 0:
                    continue
                col_stats = self.stats_cache[column]
                # Z-score method (skipped for zero/NaN std columns).
                z_scores = self._z_anomaly_scores(values, col_stats)
                if z_scores is not None:
                    z_anomalies = z_scores > self.z_threshold
                else:
                    z_anomalies = pd.Series(False, index=values.index)
                # IQR (Tukey fence) method.
                lower_bound = col_stats['q1'] - self.iqr_multiplier * col_stats['iqr']
                upper_bound = col_stats['q3'] + self.iqr_multiplier * col_stats['iqr']
                iqr_anomalies = (values < lower_bound) | (values > upper_bound)
                # A row is anomalous if either method flags it in any column.
                anomaly_flags[values.index] |= z_anomalies | iqr_anomalies
        return anomaly_flags

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Return summed per-column |z|-scores per row (higher = worse).

        Raises:
            ValueError: if called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        scores = np.zeros(len(data))
        for column in data.columns:
            if column in self.stats_cache and data[column].dtype in ['int64', 'float64']:
                values = data[column].dropna()
                if len(values) == 0:
                    continue
                z_scores = self._z_anomaly_scores(values, self.stats_cache[column])
                if z_scores is not None:
                    scores[values.index] += z_scores
        return scores
class IsolationForestAnomalyDetector(AnomalyDetector):
    """Isolation Forest anomaly detection.

    Wraps ``sklearn.ensemble.IsolationForest`` with consistent
    preprocessing: the numeric feature set and the imputation medians are
    captured at fit time and reused at inference time, so scoring does not
    depend on the statistics of the batch being scored (the original
    imputed with the *inference* batch's medians — train/serve skew).
    """

    def __init__(self, model_config: Optional[Dict[str, Any]] = None) -> None:
        """Build the underlying IsolationForest from the configuration."""
        super().__init__(model_config)
        self.contamination = self.model_config.get('contamination', 0.1)
        self.n_estimators = self.model_config.get('n_estimators', 100)
        self.model = IsolationForest(
            contamination=self.contamination,
            n_estimators=self.n_estimators,
            random_state=42  # deterministic forests across runs
        )
        # Set during fit(): ordered feature columns and their medians.
        self.feature_columns = None
        self.train_medians = None

    def _prepare(self, data: pd.DataFrame) -> Optional[np.ndarray]:
        """Project ``data`` onto the training features, impute, and scale.

        Returns the scaled matrix, or ``None`` when ``data`` has no numeric
        columns (callers then return an all-normal result, matching the
        original behavior).
        """
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return None
        # Use the training-time column order and medians so inference
        # preprocessing matches what the model and scaler were fitted on.
        numeric_data = numeric_data.reindex(columns=self.feature_columns)
        numeric_data = numeric_data.fillna(self.train_medians)
        return self.scaler.transform(numeric_data)

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the Isolation Forest on the numeric columns of ``data``.

        Raises:
            ValueError: if ``data`` contains no numeric columns.
        """
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            raise ValueError("No numeric columns found in data")
        # Remember the feature set and imputation values for inference.
        self.feature_columns = list(numeric_data.columns)
        self.train_medians = numeric_data.median()
        numeric_data = numeric_data.fillna(self.train_medians)
        scaled_data = self.scaler.fit_transform(numeric_data)
        self.model.fit(scaled_data)
        self.is_fitted = True

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Return a boolean mask of anomalous rows.

        Raises:
            ValueError: if called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        scaled_data = self._prepare(data)
        if scaled_data is None:
            return np.zeros(len(data), dtype=bool)
        # IsolationForest encodes anomalies as -1 and normal points as 1.
        return self.model.predict(scaled_data) == -1

    def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
        """Return anomaly scores where higher means more anomalous.

        Raises:
            ValueError: if called before :meth:`fit`.
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        scaled_data = self._prepare(data)
        if scaled_data is None:
            return np.zeros(len(data))
        # decision_function: larger = more normal; negate so larger = more
        # anomalous, matching the other detectors in this module.
        return -self.model.decision_function(scaled_data)
class TemporalAnomalyDetector(AnomalyDetector):
"""Temporal anomaly detection for time series data"""
def __init__(self, model_config: Dict[str, Any] = None):
    """Initialise the detector with its rolling-window settings."""
    super().__init__(model_config)
    cfg = self.model_config
    # Rolling window length (hours) and the z-score multiplier used as
    # the anomaly threshold.
    self.window_size = cfg.get('window_size', 24)
    self.threshold_multiplier = cfg.get('threshold_multiplier', 2.0)
    # Populated by fit(): per-column rolling mean/std over training data.
    self.temporal_stats = {}
def fit(self, data: pd.DataFrame) -> None:
"""Calculate temporal statistics for anomaly detection"""
if 'timestamp' not in data.columns:
raise ValueError("Timestamp column is required for temporal anomaly detection")
# Sort by timestamp
data_sorted = data.sort_values('timestamp')
# Calculate rolling statistics
for column in data_sorted.columns:
if column != 'timestamp' and data_sorted[column].dtype in ['int64', 'float64']:
# Calculate rolling mean and std
rolling_mean = data_sorted[column].rolling(window=self.window_size, min_periods=1).mean()
rolling_std = data_sorted[column].rolling(window=self.window_size, min_periods=1).std()
self.temporal_stats[column] = {
'rolling_mean': rolling_mean,
'rolling_std': rolling_std
}
self.is_fitted = True
def predict(self, data: pd.DataFrame) -> np.ndarray:
"""Predict temporal anomalies"""
if not self.is_fitted:
raise ValueError("Model must be fitted before prediction")
if 'timestamp' not in data.columns:
return np.zeros(len(data), dtype=bool)
# Sort by timestamp
data_sorted = data.sort_values('timestamp')
anomaly_flags = np.zeros(len(data_sorted), dtype=bool)
for column in data_sorted.columns:
if column in self.temporal_stats and column != 'timestamp':
values = data_sorted[column]
rolling_mean = self.temporal_stats[column]['rolling_mean']
rolling_std = self.temporal_stats[column]['rolling_std']
# Calculate z-scores based on rolling statistics
z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
column_anomalies = z_scores > self.threshold_multiplier
anomaly_flags |= column_anomalies
return anomaly_flags
def get_anomaly_scores(self, data: pd.DataFrame) -> np.ndarray:
"""Get temporal anomaly scores"""
if not self.is_fitted:
raise ValueError("Model must be fitted before prediction")
if 'timestamp' not in data.columns:
return np.zeros(len(data))
# Sort by timestamp
data_sorted = data.sort_values('timestamp')
scores = np.zeros(len(data_sorted))
for column in data_sorted.columns:
if column in self.temporal_stats and column != 'timestamp':
values = data_sorted[column]
rolling_mean = self.temporal_stats[column]['rolling_mean']
rolling_std = self.temporal_stats[column]['rolling_std']
# Calculate z-scores based on rolling statistics
z_scores = np.abs((values - rolling_mean) / (rolling_std + 1e-8))
scores += z_scores
return scores
class AnomalyDetectionEngine:
    """Main engine for anomaly detection.

    Builds detector instances from a small registry, assembles a feature
    frame from recent Incident records, and turns detector output into
    dicts shaped for ``AnomalyDetection.objects.create(**d)``.
    """
    def __init__(self):
        # Registry of algorithm key -> detector class (classes, not instances).
        self.detectors = {
            'statistical': StatisticalAnomalyDetector,
            'isolation_forest': IsolationForestAnomalyDetector,
            'temporal': TemporalAnomalyDetector
        }
    def create_detector(self, algorithm_type: str, model_config: Dict[str, Any] = None) -> AnomalyDetector:
        """Create an anomaly detector instance.

        Raises:
            ValueError: if ``algorithm_type`` is not in the registry.
        """
        if algorithm_type not in self.detectors:
            raise ValueError(f"Unknown algorithm type: {algorithm_type}")
        return self.detectors[algorithm_type](model_config)
    def prepare_incident_data(self, time_window_hours: int = 24) -> pd.DataFrame:
        """Prepare incident data for anomaly detection.

        Pulls incidents created within the trailing window and derives
        the numeric/encoded columns the detectors operate on. Returns an
        empty frame when no incidents exist in the window.
        """
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)
        # Get incidents from the time window
        incidents = Incident.objects.filter(
            created_at__gte=start_time,
            created_at__lte=end_time
        ).values(
            'id', 'created_at', 'severity', 'category', 'subcategory',
            'affected_users', 'estimated_downtime', 'status'
        )
        if not incidents:
            return pd.DataFrame()
        # Convert to DataFrame
        df = pd.DataFrame(list(incidents))
        # Convert datetime to epoch seconds (int64 nanoseconds // 1e9)
        df['timestamp'] = pd.to_datetime(df['created_at']).astype('int64') // 10**9
        # Encode categorical variables; unknown severities map to 0
        severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
        df['severity_encoded'] = df['severity'].map(severity_mapping).fillna(0)
        # Convert estimated_downtime (timedelta-like or None) to hours
        df['downtime_hours'] = df['estimated_downtime'].apply(
            lambda x: x.total_seconds() / 3600 if x else 0
        )
        # Create time-based features
        df['hour_of_day'] = pd.to_datetime(df['created_at']).dt.hour
        df['day_of_week'] = pd.to_datetime(df['created_at']).dt.dayofweek
        return df
    def detect_anomalies(self, model: PredictiveModel, time_window_hours: int = 24) -> List[Dict[str, Any]]:
        """Detect anomalies using the specified model.

        Fits the detector on the window's data and scores that same data
        (in-sample detection), returning one dict per anomalous incident.
        Errors are logged and an empty list is returned.
        """
        try:
            # Prepare data
            data = self.prepare_incident_data(time_window_hours)
            if data.empty:
                logger.warning("No incident data found for anomaly detection")
                return []
            # Create detector
            detector = self.create_detector(
                model.algorithm_type,
                model.model_config
            )
            # Fit the detector on the same window it will score
            detector.fit(data)
            # Predict anomalies
            anomaly_flags = detector.predict(data)
            anomaly_scores = detector.get_anomaly_scores(data)
            # Process results into AnomalyDetection field dicts
            anomalies = []
            for idx, is_anomaly in enumerate(anomaly_flags):
                if is_anomaly:
                    incident_data = data.iloc[idx]
                    anomaly_data = {
                        'model': model,
                        'anomaly_type': self._determine_anomaly_type(model.algorithm_type),
                        'severity': self._determine_severity(anomaly_scores[idx]),
                        # Crude score -> [0, 1] confidence normalisation (score/10, clamped)
                        'confidence_score': min(1.0, max(0.0, anomaly_scores[idx] / 10.0)),
                        'anomaly_score': float(anomaly_scores[idx]),
                        'threshold_used': self._get_threshold(model.algorithm_type, model.model_config),
                        'time_window_start': timezone.now() - timedelta(hours=time_window_hours),
                        'time_window_end': timezone.now(),
                        'description': self._generate_description(incident_data, anomaly_scores[idx]),
                        'affected_services': [incident_data.get('category', 'Unknown')],
                        'affected_metrics': ['incident_frequency', 'severity_distribution'],
                        'metadata': {
                            'incident_id': str(incident_data['id']),
                            'detection_algorithm': model.algorithm_type,
                            'time_window_hours': time_window_hours
                        }
                    }
                    anomalies.append(anomaly_data)
            return anomalies
        except Exception as e:
            logger.error(f"Error in anomaly detection: {str(e)}")
            return []
    def _determine_anomaly_type(self, algorithm_type: str) -> str:
        """Map a detector algorithm key to the stored anomaly-type label."""
        mapping = {
            'statistical': 'STATISTICAL',
            'isolation_forest': 'PATTERN',
            'temporal': 'TEMPORAL'
        }
        return mapping.get(algorithm_type, 'STATISTICAL')
    def _determine_severity(self, anomaly_score: float) -> str:
        """Bucket an anomaly score into a severity label (fixed cutoffs)."""
        if anomaly_score >= 5.0:
            return 'CRITICAL'
        elif anomaly_score >= 3.0:
            return 'HIGH'
        elif anomaly_score >= 2.0:
            return 'MEDIUM'
        else:
            return 'LOW'
    def _get_threshold(self, algorithm_type: str, model_config: Dict[str, Any]) -> float:
        """Report the configured detection threshold for the algorithm.

        Falls back to 1.0 for unknown algorithm keys.
        """
        if algorithm_type == 'statistical':
            return model_config.get('z_threshold', 3.0)
        elif algorithm_type == 'isolation_forest':
            return model_config.get('contamination', 0.1)
        elif algorithm_type == 'temporal':
            return model_config.get('threshold_multiplier', 2.0)
        return 1.0
    def _generate_description(self, incident_data: pd.Series, anomaly_score: float) -> str:
        """Generate a human-readable description for the anomaly record."""
        severity = incident_data.get('severity', 'Unknown')
        category = incident_data.get('category', 'Unknown')
        affected_users = incident_data.get('affected_users', 0)
        return f"Anomalous incident detected: {severity} severity incident in {category} category affecting {affected_users} users. Anomaly score: {anomaly_score:.2f}"
class AnomalyDetectionService:
    """Service layer for running anomaly detection and summarising results."""

    def __init__(self):
        self.engine = AnomalyDetectionEngine()

    def run_anomaly_detection(self, model_id: str = None) -> int:
        """Run detection for all active anomaly models, or just ``model_id``.

        Persists each detected anomaly and returns how many were written.
        Per-model failures are logged and skipped.
        """
        # Build one filter set; narrowing to a single model is just an
        # extra predicate on the same base query.
        criteria = {'model_type': 'ANOMALY_DETECTION', 'status': 'ACTIVE'}
        if model_id:
            criteria['id'] = model_id
        total_anomalies = 0
        for model in PredictiveModel.objects.filter(**criteria):
            try:
                anomalies = self.engine.detect_anomalies(model)
                for anomaly_data in anomalies:
                    AnomalyDetection.objects.create(**anomaly_data)
                    total_anomalies += 1
                logger.info(f"Detected {len(anomalies)} anomalies using model {model.name}")
            except Exception as e:
                logger.error(f"Error running anomaly detection for model {model.name}: {str(e)}")
        return total_anomalies

    def get_anomaly_summary(self, time_window_hours: int = 24) -> Dict[str, Any]:
        """Summarise anomalies detected within the trailing time window."""
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=time_window_hours)
        anomalies = AnomalyDetection.objects.filter(
            detected_at__gte=start_time,
            detected_at__lte=end_time
        )
        # Per-severity counts computed once, then folded into the summary.
        severity_counts = {
            level: anomalies.filter(severity=level).count()
            for level in ('CRITICAL', 'HIGH', 'MEDIUM', 'LOW')
        }
        return {
            'total_anomalies': anomalies.count(),
            'critical_anomalies': severity_counts['CRITICAL'],
            'high_anomalies': severity_counts['HIGH'],
            'medium_anomalies': severity_counts['MEDIUM'],
            'low_anomalies': severity_counts['LOW'],
            'unresolved_anomalies': anomalies.filter(
                status__in=['DETECTED', 'INVESTIGATING']
            ).count(),
            'false_positive_rate': self._calculate_false_positive_rate(anomalies),
            'average_confidence': anomalies.aggregate(
                avg=Avg('confidence_score')
            )['avg'] or 0.0
        }

    def _calculate_false_positive_rate(self, anomalies) -> float:
        """Percentage of the given anomalies marked FALSE_POSITIVE (0.0 if none)."""
        total = anomalies.count()
        if total == 0:
            return 0.0
        false_positives = anomalies.filter(status='FALSE_POSITIVE').count()
        return (false_positives / total) * 100

View File

@@ -0,0 +1,684 @@
"""
ML-based predictive models for incident management
Implements various predictive algorithms for incident prediction, severity prediction, and cost analysis
"""
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Any, Union
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
import joblib
import logging
from django.utils import timezone
from django.db.models import Q, Avg, Count, Sum, Max, Min
from incident_intelligence.models import Incident
from ..models import PredictiveModel, PredictiveInsight, CostImpactAnalysis
logger = logging.getLogger(__name__)
class BasePredictiveModel:
    """Common scaffolding shared by all predictive models.

    Holds the feature scaler, per-column label encoders, and fitting
    state; subclasses implement feature engineering, training,
    prediction, and feature-importance reporting.
    """

    def __init__(self, model_config: Dict[str, Any] = None):
        # Free-form configuration dict; None/absent config means defaults.
        self.model_config = model_config or {}
        # Shared preprocessing state, populated while fitting.
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.is_fitted = False
        self.feature_columns = []
        self.target_column = None

    def prepare_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Transform raw records into a model-ready feature frame."""
        raise NotImplementedError

    def fit(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
        """Train on (X, y) and return validation metrics."""
        raise NotImplementedError

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Produce predictions for the rows of X."""
        raise NotImplementedError

    def get_feature_importance(self) -> Dict[str, float]:
        """Map feature names to importance scores."""
        raise NotImplementedError
class IncidentPredictionModel(BasePredictiveModel):
    """Model for predicting incident occurrence.

    A RandomForest classifier over time-of-day, recent incident history,
    system-metric, and service features; predict() returns the
    probability of an incident occurring.
    """
    def __init__(self, model_config: Dict[str, Any] = None):
        super().__init__(model_config)
        self.model = RandomForestClassifier(
            n_estimators=self.model_config.get('n_estimators', 100),
            max_depth=self.model_config.get('max_depth', 10),
            random_state=42  # fixed seed for reproducible training runs
        )
    def prepare_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Build the feature frame for incident prediction.

        Every feature group is optional: a feature column is emitted only
        when its source column exists in ``data``.
        """
        features = pd.DataFrame()
        # Time-based features derived from the event timestamp.
        if 'timestamp' in data.columns:
            timestamp = pd.to_datetime(data['timestamp'])
            features['hour_of_day'] = timestamp.dt.hour
            features['day_of_week'] = timestamp.dt.dayofweek
            features['day_of_month'] = timestamp.dt.day
            features['month'] = timestamp.dt.month
            # dayofweek >= 5 means Saturday/Sunday.
            features['is_weekend'] = (timestamp.dt.dayofweek >= 5).astype(int)
            features['is_business_hours'] = ((timestamp.dt.hour >= 9) & (timestamp.dt.hour <= 17)).astype(int)
        # Historical incident-volume features (pre-aggregated upstream).
        if 'incident_count_1h' in data.columns:
            features['incident_count_1h'] = data['incident_count_1h']
        if 'incident_count_24h' in data.columns:
            features['incident_count_24h'] = data['incident_count_24h']
        if 'avg_severity_24h' in data.columns:
            features['avg_severity_24h'] = data['avg_severity_24h']
        # System metrics (if available).
        system_metrics = ['cpu_usage', 'memory_usage', 'disk_usage', 'network_usage']
        for metric in system_metrics:
            if metric in data.columns:
                features[metric] = data[metric]
        # Service-specific features.
        if 'service_name' in data.columns:
            # NOTE(review): the encoder is fit on first use, so calling
            # predict() before fit() would fit the encoder on prediction
            # data, and services unseen at fit time raise in transform().
            # Confirm this matches the intended usage.
            if 'service_name' not in self.label_encoders:
                self.label_encoders['service_name'] = LabelEncoder()
                features['service_encoded'] = self.label_encoders['service_name'].fit_transform(data['service_name'])
            else:
                features['service_encoded'] = self.label_encoders['service_name'].transform(data['service_name'])
        return features
    def fit(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
        """Fit the incident prediction model.

        Args:
            X: raw incident/telemetry records (see prepare_features).
            y: binary target — whether an incident occurred.

        Returns:
            Accuracy/precision/recall/F1 measured on a 20% hold-out.
        """
        X_processed = self.prepare_features(X)
        self.feature_columns = X_processed.columns.tolist()
        X_scaled = self.scaler.fit_transform(X_processed)
        # Stratified hold-out keeps the class balance in the validation split.
        X_train, X_val, y_train, y_val = train_test_split(
            X_scaled, y, test_size=0.2, random_state=42, stratify=y
        )
        self.model.fit(X_train, y_train)
        # Evaluate on the hold-out split. (A predict_proba call was
        # previously made here too, but its result was never used, so the
        # wasted computation has been removed.)
        y_pred = self.model.predict(X_val)
        metrics = {
            'accuracy': accuracy_score(y_val, y_pred),
            'precision': precision_score(y_val, y_pred, average='weighted'),
            'recall': recall_score(y_val, y_pred, average='weighted'),
            'f1_score': f1_score(y_val, y_pred, average='weighted')
        }
        self.is_fitted = True
        return metrics
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict incident probability (class-1 probability per row).

        Raises:
            ValueError: if called before fit().
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        X_processed = self.prepare_features(X)
        X_scaled = self.scaler.transform(X_processed)
        # Return probability of incident occurrence.
        return self.model.predict_proba(X_scaled)[:, 1]
    def get_feature_importance(self) -> Dict[str, float]:
        """Feature name -> RandomForest importance; {} when unfitted."""
        if not self.is_fitted:
            return {}
        importance_scores = self.model.feature_importances_
        return dict(zip(self.feature_columns, importance_scores))
class SeverityPredictionModel(BasePredictiveModel):
    """Predicts incident severity labels with a RandomForest classifier.

    Severity labels are trained/predicted through an ordinal encoding
    ('LOW' -> 1 ... 'EMERGENCY' -> 5) and decoded back on predict().
    """

    def __init__(self, model_config: Dict[str, Any] = None):
        super().__init__(model_config)
        self.model = RandomForestClassifier(
            n_estimators=self.model_config.get('n_estimators', 100),
            max_depth=self.model_config.get('max_depth', 10),
            random_state=42
        )
        self.severity_mapping = {
            'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5
        }
        self.reverse_severity_mapping = {v: k for k, v in self.severity_mapping.items()}

    def _encode_column(self, features: pd.DataFrame, data: pd.DataFrame, column: str) -> None:
        """Label-encode ``column`` into features['<column>_encoded'].

        Fits a fresh encoder on first use and reuses it afterwards.
        """
        encoder = self.label_encoders.get(column)
        if encoder is None:
            encoder = LabelEncoder()
            self.label_encoders[column] = encoder
            features[f'{column}_encoded'] = encoder.fit_transform(data[column])
        else:
            features[f'{column}_encoded'] = encoder.transform(data[column])

    def prepare_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Build the severity-prediction feature frame from raw incidents.

        Each feature group is emitted only when its source column exists.
        """
        features = pd.DataFrame()
        # Length/word-count signals from the free-text fields.
        if 'title' in data.columns:
            features['title_length'] = data['title'].str.len()
            features['title_word_count'] = data['title'].str.split().str.len()
        if 'description' in data.columns:
            features['description_length'] = data['description'].str.len()
            features['description_word_count'] = data['description'].str.split().str.len()
        # Label-encoded categorical fields (order matters for the scaler).
        for column in ('category', 'subcategory'):
            if column in data.columns:
                self._encode_column(features, data, column)
        # User-impact magnitude, raw and log-compressed.
        if 'affected_users' in data.columns:
            features['affected_users'] = data['affected_users']
            features['affected_users_log'] = np.log1p(data['affected_users'])
        # Calendar/clock context of the report time.
        if 'created_at' in data.columns:
            timestamp = pd.to_datetime(data['created_at'])
            features['hour_of_day'] = timestamp.dt.hour
            features['day_of_week'] = timestamp.dt.dayofweek
            features['is_weekend'] = (timestamp.dt.dayofweek >= 5).astype(int)
            features['is_business_hours'] = ((timestamp.dt.hour >= 9) & (timestamp.dt.hour <= 17)).astype(int)
        # How many rows in the batch share each row's reporter.
        if 'reporter_id' in data.columns:
            features['reporter_incident_count'] = data.groupby('reporter_id')['reporter_id'].transform('count')
        return features

    def fit(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
        """Train the classifier; returns hold-out classification metrics."""
        X_processed = self.prepare_features(X)
        self.feature_columns = X_processed.columns.tolist()
        # Map severity labels onto their ordinal codes for training.
        y_encoded = y.map(self.severity_mapping)
        X_scaled = self.scaler.fit_transform(X_processed)
        # Stratified 20% hold-out with a fixed seed for reproducibility.
        X_train, X_val, y_train, y_val = train_test_split(
            X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
        )
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_val)
        self.is_fitted = True
        return {
            'accuracy': accuracy_score(y_val, y_pred),
            'precision': precision_score(y_val, y_pred, average='weighted'),
            'recall': recall_score(y_val, y_pred, average='weighted'),
            'f1_score': f1_score(y_val, y_pred, average='weighted')
        }

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict severity labels ('LOW'..'EMERGENCY') for each row.

        Raises:
            ValueError: if called before fit().
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        scaled = self.scaler.transform(self.prepare_features(X))
        encoded = self.model.predict(scaled)
        # Decode ordinal codes back to labels; unknown codes fall back to MEDIUM.
        return np.array([self.reverse_severity_mapping.get(code, 'MEDIUM') for code in encoded])

    def get_feature_importance(self) -> Dict[str, float]:
        """Feature name -> RandomForest importance; {} when unfitted."""
        if not self.is_fitted:
            return {}
        return dict(zip(self.feature_columns, self.model.feature_importances_))
class ResolutionTimePredictionModel(BasePredictiveModel):
    """Model for predicting incident resolution time (in hours).

    RandomForest regression over severity, category, impact, timing and
    historical-assignee features.
    """
    def __init__(self, model_config: Dict[str, Any] = None):
        super().__init__(model_config)
        self.model = RandomForestRegressor(
            n_estimators=self.model_config.get('n_estimators', 100),
            max_depth=self.model_config.get('max_depth', 10),
            random_state=42
        )
    def prepare_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Prepare features for resolution time prediction.

        Each feature group is emitted only when its source column is
        present in ``data``.
        """
        features = pd.DataFrame()
        # Severity features: ordinal encoding, unknown labels -> 2 (MEDIUM).
        if 'severity' in data.columns:
            severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
            features['severity_encoded'] = data['severity'].map(severity_mapping).fillna(2)
        # Categorical features: encoder fit on first use, reused afterwards.
        if 'category' in data.columns:
            if 'category' not in self.label_encoders:
                self.label_encoders['category'] = LabelEncoder()
                features['category_encoded'] = self.label_encoders['category'].fit_transform(data['category'])
            else:
                features['category_encoded'] = self.label_encoders['category'].transform(data['category'])
        # Impact features: raw count plus log1p compression.
        if 'affected_users' in data.columns:
            features['affected_users'] = data['affected_users']
            features['affected_users_log'] = np.log1p(data['affected_users'])
        # Time-based features from the creation timestamp.
        if 'created_at' in data.columns:
            timestamp = pd.to_datetime(data['created_at'])
            features['hour_of_day'] = timestamp.dt.hour
            features['day_of_week'] = timestamp.dt.dayofweek
            features['is_weekend'] = (timestamp.dt.dayofweek >= 5).astype(int)
            features['is_business_hours'] = ((timestamp.dt.hour >= 9) & (timestamp.dt.hour <= 17)).astype(int)
        # Historical features
        if 'assigned_to' in data.columns:
            # Average resolution time for assignee.
            # NOTE(review): this reads data['resolution_time_hours'] — the
            # target itself. It exists at training time (see
            # PredictiveModelService.prepare_training_data) but normally not
            # at prediction time (KeyError), and it leaks the label into the
            # features. Confirm whether this is intended.
            features['assignee_avg_resolution_time'] = data.groupby('assigned_to')['resolution_time_hours'].transform('mean')
        # Text features: crude length signals from the free-text fields.
        if 'title' in data.columns:
            features['title_length'] = data['title'].str.len()
        if 'description' in data.columns:
            features['description_length'] = data['description'].str.len()
        return features
    def fit(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
        """Fit the resolution time prediction model.

        Args:
            X: raw incident records (see prepare_features).
            y: resolution times in hours.

        Returns:
            Hold-out regression metrics: mse, rmse and r2_score.
        """
        # Prepare features
        X_processed = self.prepare_features(X)
        self.feature_columns = X_processed.columns.tolist()
        # Scale features
        X_scaled = self.scaler.fit_transform(X_processed)
        # Split data for validation (fixed seed -> reproducible split)
        X_train, X_val, y_train, y_val = train_test_split(
            X_scaled, y, test_size=0.2, random_state=42
        )
        # Fit model
        self.model.fit(X_train, y_train)
        # Evaluate model on the 20% hold-out
        y_pred = self.model.predict(X_val)
        metrics = {
            'mse': mean_squared_error(y_val, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_val, y_pred)),
            'r2_score': r2_score(y_val, y_pred)
        }
        self.is_fitted = True
        return metrics
    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict resolution time in hours.

        Raises:
            ValueError: if called before fit().
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        X_processed = self.prepare_features(X)
        X_scaled = self.scaler.transform(X_processed)
        return self.model.predict(X_scaled)
    def get_feature_importance(self) -> Dict[str, float]:
        """Get feature importance scores ({} when the model is unfitted)."""
        if not self.is_fitted:
            return {}
        importance_scores = self.model.feature_importances_
        return dict(zip(self.feature_columns, importance_scores))
class CostPredictionModel(BasePredictiveModel):
    """Predicts incident cost impact (USD) with a RandomForest regressor."""

    def __init__(self, model_config: Dict[str, Any] = None):
        super().__init__(model_config)
        self.model = RandomForestRegressor(
            n_estimators=self.model_config.get('n_estimators', 100),
            max_depth=self.model_config.get('max_depth', 10),
            random_state=42
        )

    def _encode_column(self, features: pd.DataFrame, data: pd.DataFrame, column: str) -> None:
        """Label-encode ``column`` into features['<column>_encoded'].

        Fits a fresh encoder on first use and reuses it afterwards.
        """
        encoder = self.label_encoders.get(column)
        if encoder is None:
            encoder = LabelEncoder()
            self.label_encoders[column] = encoder
            features[f'{column}_encoded'] = encoder.fit_transform(data[column])
        else:
            features[f'{column}_encoded'] = encoder.transform(data[column])

    def prepare_features(self, data: pd.DataFrame) -> pd.DataFrame:
        """Assemble the cost-prediction feature frame.

        Each feature group is emitted only when its source column exists.
        """
        features = pd.DataFrame()
        # Ordinal severity; unknown labels default to 2 (MEDIUM).
        if 'severity' in data.columns:
            severity_mapping = {'LOW': 1, 'MEDIUM': 2, 'HIGH': 3, 'CRITICAL': 4, 'EMERGENCY': 5}
            features['severity_encoded'] = data['severity'].map(severity_mapping).fillna(2)
        # Impact magnitudes, raw plus log1p-compressed.
        if 'affected_users' in data.columns:
            features['affected_users'] = data['affected_users']
            features['affected_users_log'] = np.log1p(data['affected_users'])
        if 'downtime_hours' in data.columns:
            features['downtime_hours'] = data['downtime_hours']
            features['downtime_hours_log'] = np.log1p(data['downtime_hours'])
        # Label-encoded categorical context (order matters for the scaler).
        for column in ('category', 'business_unit'):
            if column in data.columns:
                self._encode_column(features, data, column)
        # Calendar/clock context of the incident creation time.
        if 'created_at' in data.columns:
            timestamp = pd.to_datetime(data['created_at'])
            features['hour_of_day'] = timestamp.dt.hour
            features['day_of_week'] = timestamp.dt.dayofweek
            features['is_weekend'] = (timestamp.dt.dayofweek >= 5).astype(int)
            features['is_business_hours'] = ((timestamp.dt.hour >= 9) & (timestamp.dt.hour <= 17)).astype(int)
        return features

    def fit(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
        """Train the regressor; returns hold-out regression metrics."""
        X_processed = self.prepare_features(X)
        self.feature_columns = X_processed.columns.tolist()
        X_scaled = self.scaler.fit_transform(X_processed)
        # 20% hold-out with a fixed seed for reproducibility.
        X_train, X_val, y_train, y_val = train_test_split(
            X_scaled, y, test_size=0.2, random_state=42
        )
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_val)
        self.is_fitted = True
        return {
            'mse': mean_squared_error(y_val, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_val, y_pred)),
            'r2_score': r2_score(y_val, y_pred)
        }

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict cost impact in USD for each row.

        Raises:
            ValueError: if called before fit().
        """
        if not self.is_fitted:
            raise ValueError("Model must be fitted before prediction")
        scaled = self.scaler.transform(self.prepare_features(X))
        return self.model.predict(scaled)

    def get_feature_importance(self) -> Dict[str, float]:
        """Feature name -> RandomForest importance; {} when unfitted."""
        if not self.is_fitted:
            return {}
        return dict(zip(self.feature_columns, self.model.feature_importances_))
class PredictiveModelFactory:
    """Factory mapping model-type keys to predictive model classes."""

    @staticmethod
    def create_model(model_type: str, model_config: Dict[str, Any] = None) -> BasePredictiveModel:
        """Instantiate the predictive model registered for ``model_type``.

        Raises:
            ValueError: if ``model_type`` is not a known model key.
        """
        registry = {
            'INCIDENT_PREDICTION': IncidentPredictionModel,
            'SEVERITY_PREDICTION': SeverityPredictionModel,
            'RESOLUTION_TIME_PREDICTION': ResolutionTimePredictionModel,
            'COST_PREDICTION': CostPredictionModel,
        }
        if model_type not in registry:
            raise ValueError(f"Unknown model type: {model_type}")
        model_cls = registry[model_type]
        return model_cls(model_config)
class PredictiveModelService:
    """Service for managing predictive models.

    Orchestrates training-data assembly, model training, and prediction
    generation for the PredictiveModel records stored in the database.
    """
    def __init__(self):
        self.factory = PredictiveModelFactory()
    def prepare_training_data(self, model_type: str, days_back: int = 90) -> Tuple[pd.DataFrame, pd.Series]:
        """Prepare training data for the specified model type.

        Args:
            model_type: one of the PredictiveModelFactory model keys.
            days_back: how far back to pull incidents from.

        Returns:
            (feature frame, target series); both empty when no incidents
            exist in the window.

        Raises:
            ValueError: for an unknown ``model_type``.
        """
        end_date = timezone.now()
        start_date = end_date - timedelta(days=days_back)
        # Get incidents from the time period
        incidents = Incident.objects.filter(
            created_at__gte=start_date,
            created_at__lte=end_date
        ).values(
            'id', 'title', 'description', 'severity', 'category', 'subcategory',
            'affected_users', 'estimated_downtime', 'created_at', 'resolved_at',
            'assigned_to', 'reporter', 'status'
        )
        if not incidents:
            return pd.DataFrame(), pd.Series()
        df = pd.DataFrame(list(incidents))
        # Prepare target variable based on model type
        if model_type == 'INCIDENT_PREDICTION':
            # For incident prediction, we need to create time series data
            # This is a simplified version - in practice, you'd need more sophisticated time series preparation
            # NOTE(review): the all-ones target makes the classifier
            # degenerate (single class) — a stratified split will fail.
            y = pd.Series([1] * len(df))  # Placeholder
        elif model_type == 'SEVERITY_PREDICTION':
            y = df['severity']
        elif model_type == 'RESOLUTION_TIME_PREDICTION':
            # Calculate resolution time in hours
            df['resolved_at'] = pd.to_datetime(df['resolved_at'])
            df['created_at'] = pd.to_datetime(df['created_at'])
            df['resolution_time_hours'] = (df['resolved_at'] - df['created_at']).dt.total_seconds() / 3600
            # Unresolved incidents (NaT resolved_at) get the median time.
            y = df['resolution_time_hours'].fillna(df['resolution_time_hours'].median())
        elif model_type == 'COST_PREDICTION':
            # Get cost data
            cost_analyses = CostImpactAnalysis.objects.filter(
                incident_id__in=df['id']
            ).values('incident_id', 'cost_amount')
            cost_df = pd.DataFrame(list(cost_analyses))
            if not cost_df.empty:
                df = df.merge(cost_df, left_on='id', right_on='incident_id', how='left')
                # Incidents without a cost analysis get the median cost.
                y = df['cost_amount'].fillna(df['cost_amount'].median())
            else:
                y = pd.Series([0] * len(df))
        else:
            raise ValueError(f"Unknown model type: {model_type}")
        return df, y
    def train_model(self, model_id: str) -> Dict[str, Any]:
        """Train a predictive model and persist its metrics.

        Returns a dict with 'success' plus either training metrics or an
        'error' message; never raises (errors are logged and reported).
        """
        try:
            model = PredictiveModel.objects.get(id=model_id)
            # Prepare training data
            X, y = self.prepare_training_data(model.model_type, model.training_data_period_days)
            if X.empty or len(y) < model.min_training_samples:
                return {
                    'success': False,
                    'error': f'Insufficient training data. Need at least {model.min_training_samples} samples, got {len(y)}'
                }
            # Create model instance
            ml_model = self.factory.create_model(model.model_type, model.model_config)
            # Train the model
            start_time = timezone.now()
            metrics = ml_model.fit(X, y)
            end_time = timezone.now()
            # Update model with performance metrics
            # (classifiers report 'accuracy'; regressors fall back to r2_score)
            model.accuracy_score = metrics.get('accuracy', metrics.get('r2_score'))
            model.precision_score = metrics.get('precision')
            model.recall_score = metrics.get('recall')
            model.f1_score = metrics.get('f1_score')
            model.status = 'ACTIVE'
            model.last_trained_at = end_time
            model.training_duration_seconds = (end_time - start_time).total_seconds()
            model.training_samples_count = len(y)
            model.feature_columns = ml_model.feature_columns
            # Save model (in a real implementation, you'd save the actual model file)
            # NOTE(review): only the path is recorded — the fitted estimator
            # is never serialised to disk, so generate_predictions() cannot
            # actually reload it. Confirm whether persistence is planned.
            model.model_file_path = f"models/{model.id}_{model.version}.joblib"
            model.save()
            return {
                'success': True,
                'metrics': metrics,
                'training_samples': len(y),
                'training_duration': model.training_duration_seconds
            }
        except Exception as e:
            logger.error(f"Error training model {model_id}: {str(e)}")
            return {
                'success': False,
                'error': str(e)
            }
    def generate_predictions(self, model_id: str, prediction_horizon_hours: int = 24) -> List[Dict[str, Any]]:
        """Generate predictions using a trained model.

        Returns a list of PredictiveInsight-shaped dicts (not persisted
        here); [] on any error or when no recent incident data exists.

        NOTE(review): ``ml_model`` is freshly constructed and never fitted
        or loaded from ``model_file_path``, so predict() raises and this
        method currently always returns [] via the except branch. Also,
        predictions come from ``X.tail(10)`` while ``input_features`` is
        read with ``X.iloc[i]`` (i = 0..9 of the FULL frame), so features
        would not line up with their predictions. Both need fixing before
        this path is relied on.
        """
        try:
            model = PredictiveModel.objects.get(id=model_id, status='ACTIVE')
            # Create model instance
            ml_model = self.factory.create_model(model.model_type, model.model_config)
            # Load model (in a real implementation, you'd load from the saved file)
            # For now, we'll create a mock prediction
            # Prepare prediction data
            X, _ = self.prepare_training_data(model.model_type, 7)  # Last 7 days
            if X.empty:
                return []
            # Make predictions
            predictions = ml_model.predict(X.tail(10))  # Predict for last 10 incidents
            # Create insight objects
            insights = []
            for i, prediction in enumerate(predictions):
                insight_data = {
                    'model': model,
                    'insight_type': model.model_type,
                    'title': f"Prediction for {model.model_type.replace('_', ' ').title()}",
                    'description': f"Model predicts {prediction} for upcoming incidents",
                    'confidence_level': 'MEDIUM',  # Could be calculated based on model confidence
                    'confidence_score': 0.7,  # Placeholder
                    'predicted_value': {'value': float(prediction)},
                    'prediction_horizon': prediction_horizon_hours,
                    'prediction_date': timezone.now() + timedelta(hours=prediction_horizon_hours),
                    'input_features': X.iloc[i].to_dict(),
                    'supporting_evidence': [],
                    'affected_services': [X.iloc[i].get('category', 'Unknown')],
                    'recommendations': self._generate_recommendations(model.model_type, prediction),
                    # Insights stay valid for twice the horizon.
                    'expires_at': timezone.now() + timedelta(hours=prediction_horizon_hours * 2)
                }
                insights.append(insight_data)
            return insights
        except Exception as e:
            logger.error(f"Error generating predictions for model {model_id}: {str(e)}")
            return []
    def _generate_recommendations(self, model_type: str, prediction: Any) -> List[str]:
        """Generate recommendations based on prediction.

        Thresholds are heuristic: probabilities for incident prediction,
        severity labels, hours for resolution time, USD for cost.
        """
        recommendations = []
        if model_type == 'INCIDENT_PREDICTION':
            if prediction > 0.7:
                recommendations.append("High probability of incident occurrence - consider proactive monitoring")
                recommendations.append("Ensure on-call team is ready for potential incidents")
            elif prediction > 0.4:
                recommendations.append("Moderate probability of incident - monitor system metrics closely")
        elif model_type == 'SEVERITY_PREDICTION':
            if prediction in ['CRITICAL', 'EMERGENCY']:
                recommendations.append("High severity incident predicted - prepare escalation procedures")
                recommendations.append("Ensure senior staff are available for response")
            elif prediction == 'HIGH':
                recommendations.append("High severity incident predicted - review response procedures")
        elif model_type == 'RESOLUTION_TIME_PREDICTION':
            if prediction > 24:
                recommendations.append("Long resolution time predicted - consider additional resources")
                recommendations.append("Review escalation procedures for complex incidents")
            elif prediction > 8:
                recommendations.append("Extended resolution time predicted - prepare for extended response")
        elif model_type == 'COST_PREDICTION':
            if prediction > 10000:
                recommendations.append("High cost impact predicted - prepare cost mitigation strategies")
                recommendations.append("Consider business continuity measures")
            elif prediction > 5000:
                recommendations.append("Significant cost impact predicted - review cost control measures")
        return recommendations

View File

@@ -0,0 +1,828 @@
"""
Analytics & Predictive Insights models for Enterprise Incident Management API
Implements advanced KPIs, predictive analytics, ML-based anomaly detection, and cost analysis
"""
import uuid
import json
from datetime import datetime, timedelta, time
from typing import Dict, Any, Optional, List
from decimal import Decimal
from django.db import models
from django.contrib.auth import get_user_model
from django.core.validators import MinValueValidator, MaxValueValidator
from django.utils import timezone
from django.core.exceptions import ValidationError
User = get_user_model()
class KPIMetric(models.Model):
    """Definition of a KPI metric: what to measure and how.

    A KPIMetric describes the metric type, the aggregation applied,
    which incidents it targets, and the rolling time window used for
    calculation.  Computed values are stored as related
    :class:`KPIMeasurement` rows (``metric.measurements``).
    """
    METRIC_TYPES = [
        ('MTTA', 'Mean Time to Acknowledge'),
        ('MTTR', 'Mean Time to Resolve'),
        ('MTBF', 'Mean Time Between Failures'),
        ('MTBSI', 'Mean Time Between Service Incidents'),
        ('AVAILABILITY', 'Service Availability'),
        ('INCIDENT_COUNT', 'Incident Count'),
        ('RESOLUTION_RATE', 'Resolution Rate'),
        ('ESCALATION_RATE', 'Escalation Rate'),
        ('CUSTOM', 'Custom Metric'),
    ]
    AGGREGATION_TYPES = [
        ('AVERAGE', 'Average'),
        ('MEDIAN', 'Median'),
        ('MIN', 'Minimum'),
        ('MAX', 'Maximum'),
        ('SUM', 'Sum'),
        ('COUNT', 'Count'),
        ('PERCENTILE_95', '95th Percentile'),
        ('PERCENTILE_99', '99th Percentile'),
    ]
    # UUID primary key, generated client-side (consistent with the other
    # models in this module).
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200)
    description = models.TextField()
    metric_type = models.CharField(max_length=20, choices=METRIC_TYPES)
    aggregation_type = models.CharField(max_length=20, choices=AGGREGATION_TYPES)
    # Targeting criteria — JSON lists of category/severity/priority labels.
    # NOTE(review): presumably an empty list means "applies to all";
    # confirm against the metric-calculation code.
    incident_categories = models.JSONField(
        default=list,
        help_text="List of incident categories this metric applies to"
    )
    incident_severities = models.JSONField(
        default=list,
        help_text="List of incident severities this metric applies to"
    )
    incident_priorities = models.JSONField(
        default=list,
        help_text="List of incident priorities this metric applies to"
    )
    # Calculation configuration
    calculation_formula = models.TextField(
        blank=True,
        null=True,
        help_text="Custom calculation formula for complex metrics"
    )
    time_window_hours = models.PositiveIntegerField(
        default=24,
        help_text="Time window for metric calculation in hours"
    )
    # Status and metadata
    is_active = models.BooleanField(default=True)
    is_system_metric = models.BooleanField(
        default=False,
        help_text="Whether this is a system-defined metric"
    )
    # SET_NULL so deleting a user does not delete the metric definitions
    # they created.
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['metric_type', 'is_active']),
            models.Index(fields=['incident_categories']),
            models.Index(fields=['incident_severities']),
        ]

    def __str__(self):
        return f"{self.name} ({self.metric_type})"
class KPIMeasurement(models.Model):
    """A single computed value of a :class:`KPIMetric` over one period.

    Default ordering is newest first (``-calculated_at``), so
    ``metric.measurements.first()`` is the latest measurement.
    """
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    metric = models.ForeignKey(KPIMetric, on_delete=models.CASCADE, related_name='measurements')
    # Measurement details — up to 15 digits with 4 decimal places.
    value = models.DecimalField(max_digits=15, decimal_places=4)
    unit = models.CharField(max_length=50, help_text="Unit of measurement (minutes, hours, percentage, etc.)")
    # Time period the measurement covers (distinct from calculated_at,
    # which records when the value was computed).
    measurement_period_start = models.DateTimeField()
    measurement_period_end = models.DateTimeField()
    # Context
    incident_count = models.PositiveIntegerField(
        default=0,
        help_text="Number of incidents included in this measurement"
    )
    sample_size = models.PositiveIntegerField(
        default=0,
        help_text="Total sample size for this measurement"
    )
    # Additional metadata
    metadata = models.JSONField(
        default=dict,
        help_text="Additional metadata for this measurement"
    )
    # Timestamps
    calculated_at = models.DateTimeField(auto_now_add=True)

    class Meta:
        ordering = ['-calculated_at']
        indexes = [
            models.Index(fields=['metric', 'measurement_period_start']),
            models.Index(fields=['calculated_at']),
        ]

    def __str__(self):
        return f"{self.metric.name}: {self.value} {self.unit}"
class IncidentRecurrenceAnalysis(models.Model):
    """Analysis linking one primary incident to its recurring instances.

    Captures the detected recurrence pattern, its confidence, aggregate
    impact figures, and AI-generated prevention/automation suggestions,
    including runbooks that could break the recurrence cycle.
    """
    RECURRENCE_TYPES = [
        ('EXACT_DUPLICATE', 'Exact Duplicate'),
        ('SIMILAR_PATTERN', 'Similar Pattern'),
        ('SEASONAL', 'Seasonal Recurrence'),
        ('TREND', 'Trend-based Recurrence'),
        ('CASCADE', 'Cascade Effect'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    # Related incidents: one primary incident plus the set of incidents
    # judged to be recurrences of it.
    primary_incident = models.ForeignKey(
        'incident_intelligence.Incident',
        on_delete=models.CASCADE,
        related_name='recurrence_analyses_as_primary'
    )
    recurring_incidents = models.ManyToManyField(
        'incident_intelligence.Incident',
        related_name='recurrence_analyses_as_recurring'
    )
    # Analysis details
    recurrence_type = models.CharField(max_length=20, choices=RECURRENCE_TYPES)
    confidence_score = models.FloatField(
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    recurrence_rate = models.FloatField(
        help_text="Rate of recurrence (incidents per time period)"
    )
    # Pattern characteristics (JSON blobs produced by the analysis job)
    common_keywords = models.JSONField(
        default=list,
        help_text="Common keywords across recurring incidents"
    )
    common_categories = models.JSONField(
        default=list,
        help_text="Common categories across recurring incidents"
    )
    time_pattern = models.JSONField(
        default=dict,
        help_text="Time-based pattern analysis"
    )
    # Impact analysis — aggregated across all linked incidents
    total_affected_users = models.PositiveIntegerField(default=0)
    total_downtime_hours = models.DecimalField(max_digits=10, decimal_places=2, default=0)
    estimated_cost_impact = models.DecimalField(max_digits=15, decimal_places=2, default=0)
    # Recommendations
    prevention_recommendations = models.JSONField(
        default=list,
        help_text="AI-generated recommendations to prevent recurrence"
    )
    automation_opportunities = models.JSONField(
        default=list,
        help_text="Potential automation opportunities identified"
    )
    # Automation integration
    suggested_runbooks = models.ManyToManyField(
        'automation_orchestration.Runbook',
        blank=True,
        related_name='recurrence_analyses',
        help_text="Runbooks suggested to prevent recurrence"
    )
    # Status
    is_resolved = models.BooleanField(default=False)
    resolution_actions = models.JSONField(
        default=list,
        help_text="Actions taken to resolve the recurrence pattern"
    )
    # Metadata — model_version records which analysis model produced
    # this row.
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    model_version = models.CharField(max_length=50, default='v1.0')

    class Meta:
        ordering = ['-confidence_score', '-created_at']
        indexes = [
            models.Index(fields=['recurrence_type', 'confidence_score']),
            models.Index(fields=['is_resolved']),
        ]

    def __str__(self):
        return f"Recurrence Analysis: {self.primary_incident.title} ({self.recurrence_type})"
class PredictiveModel(models.Model):
    """Registry entry for an ML model used by the analytics pipeline.

    Stores the model's configuration, training parameters, evaluation
    scores, lifecycle status, and automatic-retraining policy.  The
    trained artifact itself lives on disk at ``model_file_path``.
    """
    MODEL_TYPES = [
        ('ANOMALY_DETECTION', 'Anomaly Detection'),
        ('INCIDENT_PREDICTION', 'Incident Prediction'),
        ('SEVERITY_PREDICTION', 'Severity Prediction'),
        ('RESOLUTION_TIME_PREDICTION', 'Resolution Time Prediction'),
        ('ESCALATION_PREDICTION', 'Escalation Prediction'),
        ('COST_PREDICTION', 'Cost Impact Prediction'),
    ]
    ALGORITHM_TYPES = [
        ('ISOLATION_FOREST', 'Isolation Forest'),
        ('LSTM', 'Long Short-Term Memory'),
        ('RANDOM_FOREST', 'Random Forest'),
        ('XGBOOST', 'XGBoost'),
        ('SVM', 'Support Vector Machine'),
        ('NEURAL_NETWORK', 'Neural Network'),
        ('ARIMA', 'ARIMA'),
        ('PROPHET', 'Prophet'),
    ]
    STATUS_CHOICES = [
        ('TRAINING', 'Training'),
        ('ACTIVE', 'Active'),
        ('INACTIVE', 'Inactive'),
        ('RETRAINING', 'Retraining'),
        ('ERROR', 'Error'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200)
    description = models.TextField()
    model_type = models.CharField(max_length=30, choices=MODEL_TYPES)
    algorithm_type = models.CharField(max_length=20, choices=ALGORITHM_TYPES)
    # Model configuration
    model_config = models.JSONField(
        default=dict,
        help_text="Model-specific configuration parameters"
    )
    feature_columns = models.JSONField(
        default=list,
        help_text="List of feature columns used by the model"
    )
    target_column = models.CharField(
        max_length=100,
        help_text="Target column for prediction"
    )
    # Training data requirements
    training_data_period_days = models.PositiveIntegerField(
        default=90,
        help_text="Number of days of training data to use"
    )
    min_training_samples = models.PositiveIntegerField(
        default=100,
        help_text="Minimum number of samples required for training"
    )
    # Performance metrics — all nullable until the first evaluation;
    # constrained to the [0, 1] range.
    accuracy_score = models.FloatField(
        null=True, blank=True,
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    precision_score = models.FloatField(
        null=True, blank=True,
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    recall_score = models.FloatField(
        null=True, blank=True,
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    f1_score = models.FloatField(
        null=True, blank=True,
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    # Status and metadata
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='TRAINING')
    version = models.CharField(max_length=20, default='1.0')
    model_file_path = models.CharField(
        max_length=500,
        blank=True,
        null=True,
        help_text="Path to the trained model file"
    )
    # Training metadata (populated after each training run)
    last_trained_at = models.DateTimeField(null=True, blank=True)
    training_duration_seconds = models.PositiveIntegerField(null=True, blank=True)
    training_samples_count = models.PositiveIntegerField(null=True, blank=True)
    # Retraining configuration
    auto_retrain_enabled = models.BooleanField(default=True)
    retrain_frequency_days = models.PositiveIntegerField(default=7)
    performance_threshold = models.FloatField(
        default=0.8,
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)],
        help_text="Performance threshold below which model should be retrained"
    )
    # Metadata
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['model_type', 'status']),
            models.Index(fields=['algorithm_type']),
            models.Index(fields=['status']),
        ]

    def __str__(self):
        return f"{self.name} ({self.model_type})"
class AnomalyDetection(models.Model):
    """One anomaly flagged by a :class:`PredictiveModel`.

    Records the detection itself (scores, thresholds, time window), the
    investigation lifecycle (status, resolution), and links to the
    incidents/services/metrics involved.
    """
    ANOMALY_TYPES = [
        ('STATISTICAL', 'Statistical Anomaly'),
        ('TEMPORAL', 'Temporal Anomaly'),
        ('PATTERN', 'Pattern Anomaly'),
        ('THRESHOLD', 'Threshold Breach'),
        ('BEHAVIORAL', 'Behavioral Anomaly'),
    ]
    SEVERITY_CHOICES = [
        ('LOW', 'Low'),
        ('MEDIUM', 'Medium'),
        ('HIGH', 'High'),
        ('CRITICAL', 'Critical'),
    ]
    STATUS_CHOICES = [
        ('DETECTED', 'Detected'),
        ('INVESTIGATING', 'Investigating'),
        ('CONFIRMED', 'Confirmed'),
        ('FALSE_POSITIVE', 'False Positive'),
        ('RESOLVED', 'Resolved'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    model = models.ForeignKey(PredictiveModel, on_delete=models.CASCADE, related_name='anomaly_detections')
    # Anomaly details — lifecycle starts at DETECTED.
    anomaly_type = models.CharField(max_length=20, choices=ANOMALY_TYPES)
    severity = models.CharField(max_length=20, choices=SEVERITY_CHOICES)
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='DETECTED')
    # Detection details: the raw model score plus the threshold it was
    # compared against, and a normalized confidence in [0, 1].
    confidence_score = models.FloatField(
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    anomaly_score = models.FloatField(
        help_text="Raw anomaly score from the model"
    )
    threshold_used = models.FloatField(
        help_text="Threshold used for anomaly detection"
    )
    # Context — the data window in which the anomaly was observed.
    detected_at = models.DateTimeField(auto_now_add=True)
    time_window_start = models.DateTimeField()
    time_window_end = models.DateTimeField()
    # Related data
    related_incidents = models.ManyToManyField(
        'incident_intelligence.Incident',
        blank=True,
        related_name='anomaly_detections'
    )
    affected_services = models.JSONField(
        default=list,
        help_text="Services affected by this anomaly"
    )
    affected_metrics = models.JSONField(
        default=list,
        help_text="Metrics that showed anomalous behavior"
    )
    # Analysis (free-text, filled in during investigation)
    description = models.TextField(help_text="Description of the anomaly")
    root_cause_analysis = models.TextField(
        blank=True,
        null=True,
        help_text="Root cause analysis of the anomaly"
    )
    impact_assessment = models.TextField(
        blank=True,
        null=True,
        help_text="Assessment of the anomaly's impact"
    )
    # Actions taken / resolution bookkeeping
    actions_taken = models.JSONField(
        default=list,
        help_text="Actions taken in response to the anomaly"
    )
    resolved_at = models.DateTimeField(null=True, blank=True)
    resolved_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='resolved_anomalies'
    )
    # Metadata
    metadata = models.JSONField(
        default=dict,
        help_text="Additional metadata for this anomaly"
    )

    class Meta:
        ordering = ['-detected_at']
        indexes = [
            models.Index(fields=['anomaly_type', 'severity']),
            models.Index(fields=['status', 'detected_at']),
            models.Index(fields=['confidence_score']),
        ]

    def __str__(self):
        return f"Anomaly: {self.anomaly_type} - {self.severity} ({self.detected_at})"
class CostImpactAnalysis(models.Model):
    """Cost impact record for a single incident.

    One row per incident/cost-type pair (an incident may have several
    analyses — see the ``incident``+``cost_type`` index).  Carries the
    calculation method and breakdown, optional SLA linkage, and a
    human validation workflow.
    """
    COST_TYPES = [
        ('DOWNTIME', 'Downtime Cost'),
        ('LOST_REVENUE', 'Lost Revenue'),
        ('PENALTY', 'Penalty Cost'),
        ('RESOURCE_COST', 'Resource Cost'),
        ('REPUTATION_COST', 'Reputation Cost'),
        ('COMPLIANCE_COST', 'Compliance Cost'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    # Related incident
    incident = models.ForeignKey(
        'incident_intelligence.Incident',
        on_delete=models.CASCADE,
        related_name='cost_analyses'
    )
    # SLA integration — optional; SET_NULL keeps the cost record if the
    # SLA instance is removed.
    sla_instance = models.ForeignKey(
        'sla_oncall.SLAInstance',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='cost_analyses',
        help_text="Related SLA instance for cost calculation"
    )
    # Cost breakdown — note the help_text says USD while ``currency``
    # defaults to but is not limited to USD; callers should trust
    # ``currency``.
    cost_type = models.CharField(max_length=20, choices=COST_TYPES)
    cost_amount = models.DecimalField(
        max_digits=15,
        decimal_places=2,
        help_text="Cost amount in USD"
    )
    currency = models.CharField(max_length=3, default='USD')
    # Cost calculation details
    calculation_method = models.CharField(
        max_length=50,
        help_text="Method used to calculate the cost"
    )
    calculation_details = models.JSONField(
        default=dict,
        help_text="Detailed breakdown of cost calculation"
    )
    # Impact metrics (all optional)
    downtime_hours = models.DecimalField(
        max_digits=10,
        decimal_places=2,
        null=True,
        blank=True,
        help_text="Total downtime in hours"
    )
    affected_users = models.PositiveIntegerField(
        null=True,
        blank=True,
        help_text="Number of users affected"
    )
    revenue_impact = models.DecimalField(
        max_digits=15,
        decimal_places=2,
        null=True,
        blank=True,
        help_text="Revenue impact in USD"
    )
    # Business context
    business_unit = models.CharField(
        max_length=100,
        blank=True,
        null=True,
        help_text="Business unit affected"
    )
    service_tier = models.CharField(
        max_length=50,
        blank=True,
        null=True,
        help_text="Service tier (e.g., Premium, Standard)"
    )
    # Validation and approval workflow
    is_validated = models.BooleanField(default=False)
    validated_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='validated_cost_analyses'
    )
    validated_at = models.DateTimeField(null=True, blank=True)
    validation_notes = models.TextField(blank=True, null=True)
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['incident', 'cost_type']),
            models.Index(fields=['cost_amount']),
            models.Index(fields=['is_validated']),
        ]

    def __str__(self):
        return f"Cost Analysis: {self.incident.title} - {self.cost_type} (${self.cost_amount})"
class DashboardConfiguration(models.Model):
    """Stored configuration for an analytics dashboard.

    Layout and widgets are opaque JSON consumed by the front end;
    access is controlled by the ``is_public`` flag plus explicit user
    and role allow-lists.
    """
    DASHBOARD_TYPES = [
        ('EXECUTIVE', 'Executive Dashboard'),
        ('OPERATIONAL', 'Operational Dashboard'),
        ('TECHNICAL', 'Technical Dashboard'),
        ('CUSTOM', 'Custom Dashboard'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200)
    description = models.TextField()
    dashboard_type = models.CharField(max_length=20, choices=DASHBOARD_TYPES)
    # Dashboard configuration (front-end-defined JSON)
    layout_config = models.JSONField(
        default=dict,
        help_text="Dashboard layout configuration"
    )
    widget_configs = models.JSONField(
        default=list,
        help_text="Configuration for dashboard widgets"
    )
    # Access control
    is_public = models.BooleanField(default=False)
    allowed_users = models.ManyToManyField(
        User,
        blank=True,
        related_name='accessible_dashboards'
    )
    allowed_roles = models.JSONField(
        default=list,
        help_text="List of roles that can access this dashboard"
    )
    # Refresh configuration — default 300 s (5 minutes).
    auto_refresh_enabled = models.BooleanField(default=True)
    refresh_interval_seconds = models.PositiveIntegerField(default=300)
    # Status and metadata
    is_active = models.BooleanField(default=True)
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['dashboard_type', 'is_active']),
            models.Index(fields=['is_public']),
        ]

    def __str__(self):
        return f"{self.name} ({self.dashboard_type})"
class HeatmapData(models.Model):
    """Pre-aggregated heatmap dataset for visualization.

    Data points are stored as an opaque JSON structure already bucketed
    at ``time_granularity`` and aggregated with ``aggregation_method``;
    rendering hints (``color_scheme``) travel with the data.
    """
    HEATMAP_TYPES = [
        ('INCIDENT_FREQUENCY', 'Incident Frequency'),
        ('RESOLUTION_TIME', 'Resolution Time'),
        ('COST_IMPACT', 'Cost Impact'),
        ('ANOMALY_DENSITY', 'Anomaly Density'),
        ('SLA_PERFORMANCE', 'SLA Performance'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200)
    heatmap_type = models.CharField(max_length=20, choices=HEATMAP_TYPES)
    # Data configuration — the covered time range and bucket size.
    time_period_start = models.DateTimeField()
    time_period_end = models.DateTimeField()
    time_granularity = models.CharField(
        max_length=20,
        choices=[
            ('HOUR', 'Hour'),
            ('DAY', 'Day'),
            ('WEEK', 'Week'),
            ('MONTH', 'Month'),
        ]
    )
    # Heatmap data — note: no default, so data_points is required.
    data_points = models.JSONField(
        help_text="Heatmap data points with coordinates and values"
    )
    color_scheme = models.CharField(
        max_length=50,
        default='viridis',
        help_text="Color scheme for the heatmap"
    )
    # Aggregation settings
    aggregation_method = models.CharField(
        max_length=20,
        choices=[
            ('SUM', 'Sum'),
            ('AVERAGE', 'Average'),
            ('COUNT', 'Count'),
            ('MAX', 'Maximum'),
            ('MIN', 'Minimum'),
        ]
    )
    # Metadata
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['heatmap_type', 'time_period_start']),
            models.Index(fields=['time_granularity']),
        ]

    def __str__(self):
        return f"Heatmap: {self.name} ({self.heatmap_type})"
class PredictiveInsight(models.Model):
    """A prediction produced by a :class:`PredictiveModel`.

    Each insight carries the predicted value, its horizon and
    confidence, supporting context, generated recommendations, and two
    workflows: operator acknowledgement and post-hoc validation against
    the actual outcome.  Insights expire at ``expires_at``.
    """
    INSIGHT_TYPES = [
        ('INCIDENT_PREDICTION', 'Incident Prediction'),
        ('SEVERITY_PREDICTION', 'Severity Prediction'),
        ('RESOLUTION_TIME_PREDICTION', 'Resolution Time Prediction'),
        ('COST_PREDICTION', 'Cost Prediction'),
        ('TREND_ANALYSIS', 'Trend Analysis'),
        ('PATTERN_DETECTION', 'Pattern Detection'),
    ]
    CONFIDENCE_LEVELS = [
        ('LOW', 'Low Confidence'),
        ('MEDIUM', 'Medium Confidence'),
        ('HIGH', 'High Confidence'),
        ('VERY_HIGH', 'Very High Confidence'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    model = models.ForeignKey(PredictiveModel, on_delete=models.CASCADE, related_name='insights')
    # Security integration — optional data-classification tag.
    data_classification = models.ForeignKey(
        'security.DataClassification',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        help_text="Data classification level for this insight"
    )
    # Insight details: both a coarse label and a numeric score in [0, 1].
    insight_type = models.CharField(max_length=30, choices=INSIGHT_TYPES)
    title = models.CharField(max_length=200)
    description = models.TextField()
    confidence_level = models.CharField(max_length=20, choices=CONFIDENCE_LEVELS)
    confidence_score = models.FloatField(
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    # Prediction details
    predicted_value = models.JSONField(
        help_text="Predicted value or values"
    )
    prediction_horizon = models.PositiveIntegerField(
        help_text="Prediction horizon in hours"
    )
    prediction_date = models.DateTimeField(
        help_text="When the prediction is for"
    )
    # Context used to make the prediction
    input_features = models.JSONField(
        help_text="Input features used for the prediction"
    )
    supporting_evidence = models.JSONField(
        default=list,
        help_text="Supporting evidence for the prediction"
    )
    # Related data
    related_incidents = models.ManyToManyField(
        'incident_intelligence.Incident',
        blank=True,
        related_name='predictive_insights'
    )
    affected_services = models.JSONField(
        default=list,
        help_text="Services that may be affected"
    )
    # Recommendations
    recommendations = models.JSONField(
        default=list,
        help_text="AI-generated recommendations based on the insight"
    )
    risk_assessment = models.TextField(
        blank=True,
        null=True,
        help_text="Risk assessment based on the prediction"
    )
    # Acknowledgement workflow
    is_acknowledged = models.BooleanField(default=False)
    acknowledged_by = models.ForeignKey(
        User,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='acknowledged_insights'
    )
    acknowledged_at = models.DateTimeField(null=True, blank=True)
    # Validation workflow — once the real outcome is known.
    is_validated = models.BooleanField(default=False)
    actual_value = models.JSONField(
        null=True,
        blank=True,
        help_text="Actual value when prediction is validated"
    )
    validation_accuracy = models.FloatField(
        null=True,
        blank=True,
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)]
    )
    # Metadata — expires_at is required (no default).
    generated_at = models.DateTimeField(auto_now_add=True)
    expires_at = models.DateTimeField(
        help_text="When this insight expires"
    )

    class Meta:
        ordering = ['-generated_at']
        indexes = [
            models.Index(fields=['insight_type', 'confidence_score']),
            models.Index(fields=['prediction_date']),
            models.Index(fields=['is_acknowledged']),
        ]

    def __str__(self):
        return f"Insight: {self.title} ({self.insight_type})"

    @property
    def is_expired(self):
        """Return True once the current time has passed ``expires_at``."""
        return timezone.now() > self.expires_at

View File

@@ -0,0 +1 @@
# Analytics & Predictive Insights serializers

View File

@@ -0,0 +1,404 @@
"""
Analytics & Predictive Insights serializers for Enterprise Incident Management API
Provides comprehensive serialization for KPIs, metrics, predictive models, and insights
"""
from rest_framework import serializers
from django.contrib.auth import get_user_model
from ..models import (
KPIMetric, KPIMeasurement, IncidentRecurrenceAnalysis, PredictiveModel,
AnomalyDetection, CostImpactAnalysis, DashboardConfiguration,
HeatmapData, PredictiveInsight
)
User = get_user_model()
class KPIMetricSerializer(serializers.ModelSerializer):
    """Serializer for KPI metrics.

    Adds three read-only computed fields: the creator's username, the
    number of measurements, and a compact summary of the most recent
    measurement.
    """
    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    measurement_count = serializers.SerializerMethodField()
    latest_measurement = serializers.SerializerMethodField()

    class Meta:
        model = KPIMetric
        fields = [
            'id', 'name', 'description', 'metric_type', 'aggregation_type',
            'incident_categories', 'incident_severities', 'incident_priorities',
            'calculation_formula', 'time_window_hours', 'is_active',
            'is_system_metric', 'created_by_username', 'created_at', 'updated_at',
            'measurement_count', 'latest_measurement'
        ]
        read_only_fields = ['id', 'created_at', 'updated_at']

    def get_measurement_count(self, obj):
        """Get the number of measurements for this metric."""
        return obj.measurements.count()

    def get_latest_measurement(self, obj):
        """Get the latest measurement for this metric, or None.

        Relies on KPIMeasurement's default ordering (newest first), so
        ``first()`` is the most recent row.  ``value`` is stringified to
        keep Decimal precision intact in JSON.
        """
        latest = obj.measurements.first()
        if latest:
            return {
                'value': str(latest.value),
                'unit': latest.unit,
                'calculated_at': latest.calculated_at,
                'incident_count': latest.incident_count
            }
        return None
class KPIMeasurementSerializer(serializers.ModelSerializer):
    """Serializer for KPI measurements.

    Denormalizes the parent metric's name and type for convenient
    display without an extra request.
    """
    metric_name = serializers.CharField(source='metric.name', read_only=True)
    metric_type = serializers.CharField(source='metric.metric_type', read_only=True)

    class Meta:
        model = KPIMeasurement
        fields = [
            'id', 'metric', 'metric_name', 'metric_type', 'value', 'unit',
            'measurement_period_start', 'measurement_period_end',
            'incident_count', 'sample_size', 'metadata', 'calculated_at'
        ]
        read_only_fields = ['id', 'calculated_at']
class IncidentRecurrenceAnalysisSerializer(serializers.ModelSerializer):
    """Serializer for incident recurrence analysis.

    Exposes the primary incident's title/severity and summarizes the
    linked recurring incidents (count and titles).
    """
    primary_incident_title = serializers.CharField(source='primary_incident.title', read_only=True)
    primary_incident_severity = serializers.CharField(source='primary_incident.severity', read_only=True)
    recurring_incident_count = serializers.SerializerMethodField()
    recurring_incident_titles = serializers.SerializerMethodField()

    class Meta:
        model = IncidentRecurrenceAnalysis
        fields = [
            'id', 'primary_incident', 'primary_incident_title', 'primary_incident_severity',
            'recurring_incidents', 'recurring_incident_count', 'recurring_incident_titles',
            'recurrence_type', 'confidence_score', 'recurrence_rate',
            'common_keywords', 'common_categories', 'time_pattern',
            'total_affected_users', 'total_downtime_hours', 'estimated_cost_impact',
            'prevention_recommendations', 'automation_opportunities',
            'is_resolved', 'resolution_actions', 'created_at', 'updated_at', 'model_version'
        ]
        read_only_fields = ['id', 'created_at', 'updated_at']

    def get_recurring_incident_count(self, obj):
        """Get the number of recurring incidents."""
        return obj.recurring_incidents.count()

    def get_recurring_incident_titles(self, obj):
        """Get titles of recurring incidents."""
        return [incident.title for incident in obj.recurring_incidents.all()]
class PredictiveModelSerializer(serializers.ModelSerializer):
    """Serializer for predictive models.

    Adds read-only counts of generated insights and anomaly detections
    plus a ``performance_summary`` that buckets the four evaluation
    scores into an overall health label.
    """
    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    insight_count = serializers.SerializerMethodField()
    anomaly_detection_count = serializers.SerializerMethodField()
    performance_summary = serializers.SerializerMethodField()

    class Meta:
        model = PredictiveModel
        fields = [
            'id', 'name', 'description', 'model_type', 'algorithm_type',
            'model_config', 'feature_columns', 'target_column',
            'training_data_period_days', 'min_training_samples',
            'accuracy_score', 'precision_score', 'recall_score', 'f1_score',
            'status', 'version', 'model_file_path',
            'last_trained_at', 'training_duration_seconds', 'training_samples_count',
            'auto_retrain_enabled', 'retrain_frequency_days', 'performance_threshold',
            'created_by_username', 'created_at', 'updated_at',
            'insight_count', 'anomaly_detection_count', 'performance_summary'
        ]
        read_only_fields = ['id', 'created_at', 'updated_at']

    def get_insight_count(self, obj):
        """Get the number of insights generated by this model."""
        return obj.insights.count()

    def get_anomaly_detection_count(self, obj):
        """Get the number of anomaly detections by this model."""
        return obj.anomaly_detections.count()

    def get_performance_summary(self, obj):
        """Get a summary of model performance metrics."""
        return {
            'accuracy': obj.accuracy_score,
            'precision': obj.precision_score,
            'recall': obj.recall_score,
            'f1_score': obj.f1_score,
            'overall_health': self._calculate_overall_health(obj)
        }

    def _calculate_overall_health(self, obj):
        """Bucket the mean of the four evaluation scores into a label.

        Returns 'Unknown' only when a score is actually missing (None).
        BUGFIX: the previous implementation used truthiness
        (``all([...])``), so a legitimate score of 0.0 was treated as
        missing and any model with a zero score reported 'Unknown'.
        """
        scores = [obj.accuracy_score, obj.precision_score, obj.recall_score, obj.f1_score]
        if any(score is None for score in scores):
            return 'Unknown'
        avg_score = sum(scores) / 4
        if avg_score >= 0.9:
            return 'Excellent'
        elif avg_score >= 0.8:
            return 'Good'
        elif avg_score >= 0.7:
            return 'Fair'
        else:
            return 'Poor'
class AnomalyDetectionSerializer(serializers.ModelSerializer):
    """Serializer for anomaly detection results.

    Denormalizes producing-model and resolver details, summarizes
    related incidents, and adds the elapsed time since detection.
    """
    model_name = serializers.CharField(source='model.name', read_only=True)
    model_type = serializers.CharField(source='model.model_type', read_only=True)
    resolved_by_username = serializers.CharField(source='resolved_by.username', read_only=True)
    related_incident_count = serializers.SerializerMethodField()
    related_incident_titles = serializers.SerializerMethodField()
    time_since_detection = serializers.SerializerMethodField()

    class Meta:
        model = AnomalyDetection
        fields = [
            'id', 'model', 'model_name', 'model_type',
            'anomaly_type', 'severity', 'status',
            'confidence_score', 'anomaly_score', 'threshold_used',
            'detected_at', 'time_window_start', 'time_window_end',
            'related_incidents', 'related_incident_count', 'related_incident_titles',
            'affected_services', 'affected_metrics',
            'description', 'root_cause_analysis', 'impact_assessment',
            'actions_taken', 'resolved_at', 'resolved_by', 'resolved_by_username',
            'metadata', 'time_since_detection'
        ]
        read_only_fields = ['id', 'detected_at']

    def get_related_incident_count(self, obj):
        """Get the number of related incidents."""
        return obj.related_incidents.count()

    def get_related_incident_titles(self, obj):
        """Get titles of related incidents."""
        return [incident.title for incident in obj.related_incidents.all()]

    def get_time_since_detection(self, obj):
        """Get time elapsed since anomaly detection.

        Returns a ``timedelta`` object; rendering relies on the JSON
        encoder's timedelta handling — NOTE(review): confirm the
        configured renderer serializes timedelta as expected.
        """
        from django.utils import timezone
        return timezone.now() - obj.detected_at
class CostImpactAnalysisSerializer(serializers.ModelSerializer):
    """Serializer for cost impact analysis.

    Adds derived unit costs (per downtime hour, per affected user);
    both return None when the corresponding denominator is missing or
    zero, avoiding division errors.
    """
    incident_title = serializers.CharField(source='incident.title', read_only=True)
    incident_severity = serializers.CharField(source='incident.severity', read_only=True)
    validated_by_username = serializers.CharField(source='validated_by.username', read_only=True)
    cost_per_hour = serializers.SerializerMethodField()
    cost_per_user = serializers.SerializerMethodField()

    class Meta:
        model = CostImpactAnalysis
        fields = [
            'id', 'incident', 'incident_title', 'incident_severity',
            'cost_type', 'cost_amount', 'currency',
            'calculation_method', 'calculation_details',
            'downtime_hours', 'affected_users', 'revenue_impact',
            'business_unit', 'service_tier',
            'is_validated', 'validated_by', 'validated_by_username',
            'validated_at', 'validation_notes',
            'created_at', 'updated_at',
            'cost_per_hour', 'cost_per_user'
        ]
        read_only_fields = ['id', 'created_at', 'updated_at']

    def get_cost_per_hour(self, obj):
        """Calculate cost per hour of downtime (None if no downtime)."""
        if obj.downtime_hours and obj.downtime_hours > 0:
            return float(obj.cost_amount / obj.downtime_hours)
        return None

    def get_cost_per_user(self, obj):
        """Calculate cost per affected user (None if no users recorded)."""
        if obj.affected_users and obj.affected_users > 0:
            return float(obj.cost_amount / obj.affected_users)
        return None
class DashboardConfigurationSerializer(serializers.ModelSerializer):
    """Serializer for dashboard configurations.

    Summarizes the access list (count and usernames) and reports the
    number of configured widgets.
    """
    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    allowed_user_count = serializers.SerializerMethodField()
    allowed_usernames = serializers.SerializerMethodField()
    widget_count = serializers.SerializerMethodField()

    class Meta:
        model = DashboardConfiguration
        fields = [
            'id', 'name', 'description', 'dashboard_type',
            'layout_config', 'widget_configs', 'widget_count',
            'is_public', 'allowed_users', 'allowed_user_count', 'allowed_usernames',
            'allowed_roles', 'auto_refresh_enabled', 'refresh_interval_seconds',
            'is_active', 'created_by_username', 'created_at', 'updated_at'
        ]
        read_only_fields = ['id', 'created_at', 'updated_at']

    def get_allowed_user_count(self, obj):
        """Get the number of allowed users."""
        return obj.allowed_users.count()

    def get_allowed_usernames(self, obj):
        """Get usernames of allowed users."""
        return [user.username for user in obj.allowed_users.all()]

    def get_widget_count(self, obj):
        """Get the number of widgets in this dashboard (0 when unset)."""
        return len(obj.widget_configs) if obj.widget_configs else 0
class HeatmapDataSerializer(serializers.ModelSerializer):
    """Serializer for heatmap data.

    Adds the number of stored data points and the covered time span in
    hours, both derived from model fields.
    """
    data_point_count = serializers.SerializerMethodField()
    time_span_hours = serializers.SerializerMethodField()

    class Meta:
        model = HeatmapData
        fields = [
            'id', 'name', 'heatmap_type', 'time_period_start', 'time_period_end',
            'time_granularity', 'data_points', 'data_point_count',
            'color_scheme', 'aggregation_method', 'time_span_hours',
            'created_at', 'updated_at'
        ]
        read_only_fields = ['id', 'created_at', 'updated_at']

    def get_data_point_count(self, obj):
        """Get the number of data points (0 when unset)."""
        return len(obj.data_points) if obj.data_points else 0

    def get_time_span_hours(self, obj):
        """Get the time span in hours.

        Plain datetime subtraction; the previously imported
        ``django.utils.timezone`` was unused and has been removed.
        """
        delta = obj.time_period_end - obj.time_period_start
        return delta.total_seconds() / 3600
class PredictiveInsightSerializer(serializers.ModelSerializer):
    """Serializer for predictive insights.

    Flattens model and acknowledger names into the payload and derives
    related-incident counts/titles plus expiry information.
    """
    model_name = serializers.CharField(source='model.name', read_only=True)
    model_type = serializers.CharField(source='model.model_type', read_only=True)
    acknowledged_by_username = serializers.CharField(source='acknowledged_by.username', read_only=True)
    related_incident_count = serializers.SerializerMethodField()
    related_incident_titles = serializers.SerializerMethodField()
    time_until_expiry = serializers.SerializerMethodField()
    is_expired = serializers.SerializerMethodField()

    class Meta:
        model = PredictiveInsight
        fields = [
            'id', 'model', 'model_name', 'model_type',
            'insight_type', 'title', 'description',
            'confidence_level', 'confidence_score',
            'predicted_value', 'prediction_horizon', 'prediction_date',
            'input_features', 'supporting_evidence',
            'related_incidents', 'related_incident_count', 'related_incident_titles',
            'affected_services', 'recommendations', 'risk_assessment',
            'is_acknowledged', 'acknowledged_by', 'acknowledged_by_username',
            'acknowledged_at', 'is_validated', 'actual_value', 'validation_accuracy',
            'generated_at', 'expires_at', 'time_until_expiry', 'is_expired'
        ]
        read_only_fields = ['id', 'generated_at']

    def get_related_incident_count(self, obj):
        """Number of incidents linked to this insight."""
        return obj.related_incidents.count()

    def get_related_incident_titles(self, obj):
        """Titles of all linked incidents."""
        titles = []
        for incident in obj.related_incidents.all():
            titles.append(incident.title)
        return titles

    def get_time_until_expiry(self, obj):
        """Timedelta remaining until the insight expires (negative if past)."""
        from django.utils import timezone
        now = timezone.now()
        return obj.expires_at - now

    def get_is_expired(self, obj):
        """Whether the insight has expired (delegates to the model property)."""
        return obj.is_expired
# Summary and aggregation serializers
class KPISummarySerializer(serializers.Serializer):
    """Serializer for KPI summary data.

    Not model-backed: instances are plain dicts assembled by the analytics
    views (one entry per KPI metric, describing its latest measurement).
    """
    metric_type = serializers.CharField()
    metric_name = serializers.CharField()
    current_value = serializers.DecimalField(max_digits=15, decimal_places=4)
    unit = serializers.CharField()
    # Direction of change versus the previous measurement.
    trend = serializers.CharField()  # 'up', 'down', 'stable'
    trend_percentage = serializers.DecimalField(max_digits=5, decimal_places=2)
    # Bounds of the measurement window the value was computed over.
    period_start = serializers.DateTimeField()
    period_end = serializers.DateTimeField()
    incident_count = serializers.IntegerField()
    # Optional target; views currently emit None / True placeholders.
    target_value = serializers.DecimalField(max_digits=15, decimal_places=4, allow_null=True)
    target_met = serializers.BooleanField()
class AnomalySummarySerializer(serializers.Serializer):
    """Serializer for anomaly summary data.

    Not model-backed: a single aggregate dict produced by the anomaly
    summary endpoints (counts by severity plus resolution statistics).
    """
    total_anomalies = serializers.IntegerField()
    # Per-severity breakdown of all anomalies in scope.
    critical_anomalies = serializers.IntegerField()
    high_anomalies = serializers.IntegerField()
    medium_anomalies = serializers.IntegerField()
    low_anomalies = serializers.IntegerField()
    # Anomalies still in DETECTED/INVESTIGATING status.
    unresolved_anomalies = serializers.IntegerField()
    # Percentage of anomalies marked FALSE_POSITIVE.
    false_positive_rate = serializers.DecimalField(max_digits=5, decimal_places=2)
    average_resolution_time = serializers.DurationField()
class CostSummarySerializer(serializers.Serializer):
    """Serializer for cost summary data.

    Not model-backed: a single aggregate dict of cost-impact totals
    produced by the cost summary endpoints.
    """
    total_cost = serializers.DecimalField(max_digits=15, decimal_places=2)
    currency = serializers.CharField()
    # Per-cost-type breakdown of the total.
    downtime_cost = serializers.DecimalField(max_digits=15, decimal_places=2)
    lost_revenue = serializers.DecimalField(max_digits=15, decimal_places=2)
    penalty_cost = serializers.DecimalField(max_digits=15, decimal_places=2)
    resource_cost = serializers.DecimalField(max_digits=15, decimal_places=2)
    total_downtime_hours = serializers.DecimalField(max_digits=10, decimal_places=2)
    total_affected_users = serializers.IntegerField()
    # Derived ratios (0 when the corresponding denominator is 0).
    cost_per_hour = serializers.DecimalField(max_digits=10, decimal_places=2)
    cost_per_user = serializers.DecimalField(max_digits=10, decimal_places=2)
class PredictiveInsightSummarySerializer(serializers.Serializer):
    """Serializer for predictive insight summary data.

    Not model-backed: a single aggregate dict of insight counts grouped
    by confidence level and lifecycle state.
    """
    total_insights = serializers.IntegerField()
    # Breakdown by confidence_level (HIGH / MEDIUM / LOW).
    high_confidence_insights = serializers.IntegerField()
    medium_confidence_insights = serializers.IntegerField()
    low_confidence_insights = serializers.IntegerField()
    # Lifecycle counters.
    acknowledged_insights = serializers.IntegerField()
    validated_insights = serializers.IntegerField()
    expired_insights = serializers.IntegerField()
    average_accuracy = serializers.DecimalField(max_digits=5, decimal_places=2)
    # Number of predictive models currently in ACTIVE status.
    active_models = serializers.IntegerField()
class DashboardDataSerializer(serializers.Serializer):
    """Serializer for complete dashboard data.

    Composes the individual summary serializers plus recent-activity
    lists into the single payload returned by the dashboard data endpoint.
    """
    kpi_summary = KPISummarySerializer(many=True)
    anomaly_summary = AnomalySummarySerializer()
    cost_summary = CostSummarySerializer()
    insight_summary = PredictiveInsightSummarySerializer()
    # Most recent items only (the views slice these lists).
    recent_anomalies = AnomalyDetectionSerializer(many=True)
    recent_insights = PredictiveInsightSerializer(many=True)
    heatmap_data = HeatmapDataSerializer(many=True)
    last_updated = serializers.DateTimeField()

View File

@@ -0,0 +1,238 @@
"""
Signals for analytics_predictive_insights app
Handles automatic KPI calculations and analytics updates
"""
import logging
from datetime import timedelta

from django.db import models
from django.db.models.signals import post_save, post_delete
from django.dispatch import receiver
from django.utils import timezone

from incident_intelligence.models import Incident
from .models import KPIMetric, KPIMeasurement, CostImpactAnalysis, PredictiveModel
from .ml.anomaly_detection import AnomalyDetectionService
from .ml.predictive_models import PredictiveModelService
logger = logging.getLogger(__name__)
@receiver(post_save, sender=Incident)
def update_kpi_measurements_on_incident_change(sender, instance, created, **kwargs):
    """Update KPI measurements when incidents are created or updated.

    Fires on every Incident save. New incidents are always processed;
    updates are processed only when the incident is resolved. Each KPI
    metric whose filters match the incident gets its measurement recomputed.
    """
    try:
        # Only process if incident is resolved or status changed
        if not created and not instance.is_resolved:
            return
        # Get active KPI metrics that apply to this incident.
        # The __isnull=True branches treat an unset filter list as a
        # wildcard, so metrics with no category/severity/priority
        # restriction match every incident.
        applicable_metrics = KPIMetric.objects.filter(
            is_active=True
        ).filter(
            # Check if metric applies to this incident
            models.Q(incident_categories__contains=[instance.category]) |
            models.Q(incident_severities__contains=[instance.severity]) |
            models.Q(incident_priorities__contains=[instance.priority]) |
            models.Q(incident_categories__isnull=True) |
            models.Q(incident_severities__isnull=True) |
            models.Q(incident_priorities__isnull=True)
        )
        for metric in applicable_metrics:
            # Calculate and update KPI measurement
            _calculate_kpi_measurement(metric, instance)
    except Exception as e:
        # Signal handlers must never raise; log and move on.
        logger.error(f"Error updating KPI measurements for incident {instance.id}: {str(e)}")
@receiver(post_save, sender=Incident)
def trigger_anomaly_detection_on_incident(sender, instance, created, **kwargs):
    """Kick off anomaly detection whenever a brand-new incident is saved."""
    if not created:
        return
    try:
        # Run detection across all active models for the fresh incident.
        AnomalyDetectionService().run_anomaly_detection()
    except Exception as e:
        logger.error(f"Error running anomaly detection for incident {instance.id}: {str(e)}")
@receiver(post_save, sender=CostImpactAnalysis)
def update_cost_analytics_on_cost_change(sender, instance, created, **kwargs):
    """Recompute COST_IMPACT KPIs whenever a cost analysis is saved."""
    try:
        active_cost_metrics = KPIMetric.objects.filter(
            is_active=True,
            metric_type='COST_IMPACT'
        )
        for cost_metric in active_cost_metrics:
            _calculate_kpi_measurement(cost_metric, instance.incident)
    except Exception as e:
        logger.error(f"Error updating cost analytics for cost analysis {instance.id}: {str(e)}")
def _calculate_kpi_measurement(metric, incident):
    """Recompute and persist the measurement for one KPI metric.

    Collects incidents inside the metric's rolling time window (ending now),
    narrows them by the metric's category/severity/priority filters, computes
    the metric value, and upserts the matching KPIMeasurement row. Errors are
    logged rather than raised so calling signal handlers stay safe.

    Args:
        metric: The KPIMetric to recompute.
        incident: The incident that triggered the update. Kept for interface
            compatibility; the calculation itself is window-based.
    """
    try:
        # Rolling window ending now, sized by the metric configuration.
        end_time = timezone.now()
        start_time = end_time - timedelta(hours=metric.time_window_hours)
        incidents = _incidents_for_metric(metric, start_time, end_time)
        value, unit = _compute_metric_value(metric, incidents)
        # Hoisted: .count() issues a query each call, so do it once.
        window_count = incidents.count()
        measurement, created = KPIMeasurement.objects.get_or_create(
            metric=metric,
            measurement_period_start=start_time,
            measurement_period_end=end_time,
            defaults={
                'value': value,
                'unit': unit,
                'incident_count': window_count,
                'sample_size': window_count
            }
        )
        if not created:
            measurement.value = value
            measurement.unit = unit
            measurement.incident_count = window_count
            measurement.sample_size = window_count
            measurement.save()
        logger.info(f"Updated KPI measurement for {metric.name}: {value} {unit}")
    except Exception as e:
        logger.error(f"Error calculating KPI measurement for {metric.name}: {str(e)}")

def _incidents_for_metric(metric, start_time, end_time):
    """Incidents created in [start_time, end_time] matching the metric's filters."""
    incidents = Incident.objects.filter(
        created_at__gte=start_time,
        created_at__lte=end_time
    )
    # Empty/unset filter lists mean "match everything".
    if metric.incident_categories:
        incidents = incidents.filter(category__in=metric.incident_categories)
    if metric.incident_severities:
        incidents = incidents.filter(severity__in=metric.incident_severities)
    if metric.incident_priorities:
        incidents = incidents.filter(priority__in=metric.incident_priorities)
    return incidents

def _average_elapsed_seconds(incidents, end_field):
    """Mean of (end_field - created_at) in seconds, or None when no row has both timestamps."""
    total = timedelta()
    count = 0
    for inc in incidents:
        end_value = getattr(inc, end_field)
        if end_value and inc.created_at:
            total += end_value - inc.created_at
            count += 1
    if count == 0:
        return None
    return (total / count).total_seconds()

def _compute_metric_value(metric, incidents):
    """Return (value, unit) for the metric over the given incident queryset."""
    if metric.metric_type == 'MTTA':
        # Mean Time to Acknowledge, in minutes.
        # NOTE(review): acknowledgment time is not tracked directly, so
        # updated_at - created_at is used as a proxy — confirm acceptable.
        acknowledged = incidents.filter(
            status__in=['IN_PROGRESS', 'RESOLVED', 'CLOSED']
        ).exclude(assigned_to__isnull=True)
        seconds = _average_elapsed_seconds(acknowledged, 'updated_at')
        return (seconds / 60 if seconds is not None else 0), 'minutes'
    if metric.metric_type == 'MTTR':
        # Mean Time to Resolve, in hours.
        resolved = incidents.filter(
            status__in=['RESOLVED', 'CLOSED'],
            resolved_at__isnull=False
        )
        seconds = _average_elapsed_seconds(resolved, 'resolved_at')
        return (seconds / 3600 if seconds is not None else 0), 'hours'
    if metric.metric_type == 'INCIDENT_COUNT':
        return incidents.count(), 'count'
    if metric.metric_type == 'RESOLUTION_RATE':
        # Share of window incidents already resolved/closed, as a percentage.
        total = incidents.count()
        resolved = incidents.filter(status__in=['RESOLVED', 'CLOSED']).count()
        return ((resolved / total) * 100 if total > 0 else 0), 'percentage'
    # Unknown metric types fall back to a plain count.
    return incidents.count(), 'count'
# Management command signals for scheduled tasks
@receiver(post_save, sender=PredictiveModel)
def schedule_model_training(sender, instance, created, **kwargs):
    """Record a training request when a model is created in TRAINING state."""
    try:
        needs_training = created and instance.status == 'TRAINING'
        if needs_training:
            # Placeholder: a real deployment would enqueue a background
            # training job here instead of just logging.
            logger.info(f"Scheduled training for model {instance.name}")
    except Exception as e:
        logger.error(f"Error scheduling model training for {instance.name}: {str(e)}")
@receiver(post_save, sender=PredictiveModel)
def trigger_model_retraining(sender, instance, created, **kwargs):
    """Flag a model for retraining when its accuracy drops below threshold.

    Fires on PredictiveModel saves. If auto-retraining is enabled and the
    model's accuracy has fallen under its performance threshold, the status
    is switched to RETRAINING.
    """
    try:
        if created or not instance.auto_retrain_enabled:
            return
        # Re-entrancy guard: this handler saves the instance, which fires
        # post_save again. Without this check the handler would keep
        # re-saving forever once accuracy is below threshold.
        if instance.status == 'RETRAINING':
            return
        if (instance.accuracy_score and
                instance.accuracy_score < instance.performance_threshold):
            instance.status = 'RETRAINING'
            # update_fields keeps the write minimal; the status check above
            # stops the recursive signal invocation.
            instance.save(update_fields=['status'])
            logger.info(f"Triggered retraining for model {instance.name} due to low performance")
    except Exception as e:
        logger.error(f"Error triggering model retraining for {instance.name}: {str(e)}")

View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View File

@@ -0,0 +1,50 @@
"""
URL configuration for analytics_predictive_insights app
"""
from django.urls import path, include
from rest_framework.routers import DefaultRouter
from .views.analytics import (
KPIMetricViewSet, KPIMeasurementViewSet, IncidentRecurrenceAnalysisViewSet,
PredictiveModelViewSet, AnomalyDetectionViewSet, CostImpactAnalysisViewSet,
DashboardConfigurationViewSet, HeatmapDataViewSet, PredictiveInsightViewSet
)
# Create router and register viewsets
router = DefaultRouter()
router.register(r'kpi-metrics', KPIMetricViewSet)
router.register(r'kpi-measurements', KPIMeasurementViewSet)
router.register(r'recurrence-analyses', IncidentRecurrenceAnalysisViewSet)
router.register(r'predictive-models', PredictiveModelViewSet)
router.register(r'anomaly-detections', AnomalyDetectionViewSet)
router.register(r'cost-analyses', CostImpactAnalysisViewSet)
router.register(r'dashboard-configurations', DashboardConfigurationViewSet)
router.register(r'heatmap-data', HeatmapDataViewSet)
router.register(r'predictive-insights', PredictiveInsightViewSet)
app_name = 'analytics_predictive_insights'
urlpatterns = [
# Include router URLs
path('', include(router.urls)),
# Additional custom endpoints
path('dashboard/<uuid:dashboard_id>/data/',
DashboardConfigurationViewSet.as_view({'get': 'data'}),
name='dashboard-data'),
path('kpi-metrics/summary/',
KPIMetricViewSet.as_view({'get': 'summary'}),
name='kpi-summary'),
path('anomaly-detections/summary/',
AnomalyDetectionViewSet.as_view({'get': 'summary'}),
name='anomaly-summary'),
path('cost-analyses/summary/',
CostImpactAnalysisViewSet.as_view({'get': 'summary'}),
name='cost-summary'),
path('predictive-insights/summary/',
PredictiveInsightViewSet.as_view({'get': 'summary'}),
name='insight-summary'),
]

View File

@@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View File

@@ -0,0 +1 @@
# Analytics & Predictive Insights views

View File

@@ -0,0 +1,714 @@
"""
Analytics & Predictive Insights views for Enterprise Incident Management API
Implements comprehensive analytics endpoints for KPIs, predictive insights, and dashboards
"""
from rest_framework import viewsets, status, permissions
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.pagination import PageNumberPagination
from django_filters.rest_framework import DjangoFilterBackend
from django_filters import rest_framework as filters
from django.db.models import Q, Avg, Count, Sum, Max, Min
from django.utils import timezone
from datetime import datetime, timedelta
from decimal import Decimal
from ..models import (
KPIMetric, KPIMeasurement, IncidentRecurrenceAnalysis, PredictiveModel,
AnomalyDetection, CostImpactAnalysis, DashboardConfiguration,
HeatmapData, PredictiveInsight
)
from ..serializers.analytics import (
KPIMetricSerializer, KPIMeasurementSerializer, IncidentRecurrenceAnalysisSerializer,
PredictiveModelSerializer, AnomalyDetectionSerializer, CostImpactAnalysisSerializer,
DashboardConfigurationSerializer, HeatmapDataSerializer, PredictiveInsightSerializer,
KPISummarySerializer, AnomalySummarySerializer, CostSummarySerializer,
PredictiveInsightSummarySerializer, DashboardDataSerializer
)
class StandardResultsSetPagination(PageNumberPagination):
    """Standard pagination for analytics endpoints.

    20 items per page by default; clients may raise this to at most 100
    via the `page_size` query parameter.
    """
    page_size = 20
    page_size_query_param = 'page_size'
    max_page_size = 100
class KPIMetricFilter(filters.FilterSet):
    """Filter for KPI metrics.

    Supports exact matches on type/flags and an inclusive creation-date
    range via `created_after` / `created_before`.
    """
    metric_type = filters.ChoiceFilter(choices=KPIMetric.METRIC_TYPES)
    is_active = filters.BooleanFilter()
    is_system_metric = filters.BooleanFilter()
    # Date-range bounds mapped onto created_at (both inclusive).
    created_after = filters.DateTimeFilter(field_name='created_at', lookup_expr='gte')
    created_before = filters.DateTimeFilter(field_name='created_at', lookup_expr='lte')
    class Meta:
        model = KPIMetric
        fields = ['metric_type', 'is_active', 'is_system_metric', 'created_after', 'created_before']
class KPIMetricViewSet(viewsets.ModelViewSet):
    """ViewSet for KPI metrics management.

    Staff users see every metric; non-staff users only see active ones.
    Extra routes: per-metric `measurements` and a cross-metric `summary`.
    """
    queryset = KPIMetric.objects.all()
    serializer_class = KPIMetricSerializer
    pagination_class = StandardResultsSetPagination
    filter_backends = [DjangoFilterBackend]
    filterset_class = KPIMetricFilter
    permission_classes = [permissions.IsAuthenticated]

    def get_queryset(self):
        """Filter queryset based on user permissions."""
        queryset = super().get_queryset()
        if not self.request.user.is_staff:
            # Non-staff users can only see active metrics
            queryset = queryset.filter(is_active=True)
        return queryset.order_by('-created_at')

    @action(detail=True, methods=['get'])
    def measurements(self, request, pk=None):
        """Get measurements for a specific KPI metric, optionally date-bounded."""
        metric = self.get_object()
        measurements = metric.measurements.all().order_by('-calculated_at')
        # Apply date filtering if provided
        start_date = request.query_params.get('start_date')
        end_date = request.query_params.get('end_date')
        if start_date:
            measurements = measurements.filter(measurement_period_start__gte=start_date)
        if end_date:
            measurements = measurements.filter(measurement_period_end__lte=end_date)
        # Paginate results
        paginator = StandardResultsSetPagination()
        page = paginator.paginate_queryset(measurements, request)
        if page is not None:
            serializer = KPIMeasurementSerializer(page, many=True)
            return paginator.get_paginated_response(serializer.data)
        serializer = KPIMeasurementSerializer(measurements, many=True)
        return Response(serializer.data)

    @action(detail=False, methods=['get'])
    def summary(self, request):
        """Summary of all KPI metrics with trend vs. the previous measurement."""
        metrics = self.get_queryset()
        summaries = []
        for metric in metrics:
            latest_measurement = metric.measurements.first()
            if not latest_measurement:
                continue
            # Compare with the second-most-recent measurement for the trend.
            previous_measurement = metric.measurements.all()[1:2].first()
            trend = 'stable'
            trend_percentage = Decimal('0.00')
            if previous_measurement:
                if latest_measurement.value > previous_measurement.value:
                    trend = 'up'
                elif latest_measurement.value < previous_measurement.value:
                    trend = 'down'
                # Guard against division by zero: a previous value of 0
                # previously raised and produced a 500 response.
                if trend != 'stable' and previous_measurement.value != 0:
                    change = abs(latest_measurement.value - previous_measurement.value)
                    trend_percentage = (change / previous_measurement.value) * 100
            summaries.append({
                'metric_type': metric.metric_type,
                'metric_name': metric.name,
                'current_value': latest_measurement.value,
                'unit': latest_measurement.unit,
                'trend': trend,
                'trend_percentage': trend_percentage,
                'period_start': latest_measurement.measurement_period_start,
                'period_end': latest_measurement.measurement_period_end,
                'incident_count': latest_measurement.incident_count,
                'target_value': None,  # Could be added to metric model
                'target_met': True  # Could be calculated based on target
            })
        serializer = KPISummarySerializer(summaries, many=True)
        return Response(serializer.data)
class KPIMeasurementViewSet(viewsets.ReadOnlyModelViewSet):
    """Read-only ViewSet exposing KPI measurements."""
    queryset = KPIMeasurement.objects.all()
    serializer_class = KPIMeasurementSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]

    def get_queryset(self):
        """Narrow results by metric and measurement-period bounds from the query string."""
        qs = super().get_queryset()
        params = self.request.query_params
        metric_id = params.get('metric_id')
        if metric_id:
            qs = qs.filter(metric_id=metric_id)
        start_date = params.get('start_date')
        if start_date:
            qs = qs.filter(measurement_period_start__gte=start_date)
        end_date = params.get('end_date')
        if end_date:
            qs = qs.filter(measurement_period_end__lte=end_date)
        return qs.order_by('-calculated_at')
class IncidentRecurrenceAnalysisViewSet(viewsets.ReadOnlyModelViewSet):
    """ViewSet for incident recurrence analysis.

    Read-only listing with query-string filters plus an `unresolved`
    convenience route.
    """
    queryset = IncidentRecurrenceAnalysis.objects.all()
    serializer_class = IncidentRecurrenceAnalysisSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]

    def get_queryset(self):
        """Filter by recurrence_type, minimum confidence, and resolution status."""
        queryset = super().get_queryset()
        recurrence_type = self.request.query_params.get('recurrence_type')
        if recurrence_type:
            queryset = queryset.filter(recurrence_type=recurrence_type)
        min_confidence = self.request.query_params.get('min_confidence')
        if min_confidence:
            # Untrusted input: a non-numeric value previously raised
            # ValueError and surfaced as a 500. Ignore malformed values.
            try:
                queryset = queryset.filter(confidence_score__gte=float(min_confidence))
            except ValueError:
                pass
        is_resolved = self.request.query_params.get('is_resolved')
        if is_resolved is not None:
            queryset = queryset.filter(is_resolved=is_resolved.lower() == 'true')
        return queryset.order_by('-confidence_score', '-created_at')

    @action(detail=False, methods=['get'])
    def unresolved(self, request):
        """Get unresolved recurrence analyses (paginated)."""
        queryset = self.get_queryset().filter(is_resolved=False)
        paginator = StandardResultsSetPagination()
        page = paginator.paginate_queryset(queryset, request)
        if page is not None:
            serializer = self.get_serializer(page, many=True)
            return paginator.get_paginated_response(serializer.data)
        serializer = self.get_serializer(queryset, many=True)
        return Response(serializer.data)
class PredictiveModelViewSet(viewsets.ModelViewSet):
    """ViewSet for predictive models management.

    Full CRUD plus two extra routes: `train` (POST) flips a model into
    TRAINING status, and `performance` (GET) returns its evaluation metrics.
    """
    queryset = PredictiveModel.objects.all()
    serializer_class = PredictiveModelSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]
    def get_queryset(self):
        """Filter queryset based on user permissions"""
        queryset = super().get_queryset()
        # Filter by model type
        model_type = self.request.query_params.get('model_type')
        if model_type:
            queryset = queryset.filter(model_type=model_type)
        # Filter by status
        status_filter = self.request.query_params.get('status')
        if status_filter:
            queryset = queryset.filter(status=status_filter)
        return queryset.order_by('-created_at')
    @action(detail=True, methods=['post'])
    def train(self, request, pk=None):
        """Trigger model training.

        NOTE(review): this only updates the status field — no actual
        training job is started here; confirm a worker picks this up.
        """
        model = self.get_object()
        # Update model status to training
        model.status = 'TRAINING'
        model.save()
        # Here you would typically trigger the actual training process
        # For now, we'll just return a success response
        return Response({
            'message': f'Training started for model {model.name}',
            'model_id': str(model.id),
            'status': model.status
        }, status=status.HTTP_202_ACCEPTED)
    @action(detail=True, methods=['get'])
    def performance(self, request, pk=None):
        """Get model performance metrics (scores, training stats, usage counts)."""
        model = self.get_object()
        performance_data = {
            'accuracy': model.accuracy_score,
            'precision': model.precision_score,
            'recall': model.recall_score,
            'f1_score': model.f1_score,
            'training_samples': model.training_samples_count,
            'last_trained': model.last_trained_at,
            'training_duration': model.training_duration_seconds,
            'insight_count': model.insights.count(),
            'anomaly_detection_count': model.anomaly_detections.count()
        }
        return Response(performance_data)
class AnomalyDetectionViewSet(viewsets.ReadOnlyModelViewSet):
    """ViewSet for anomaly detection results.

    Read-only CRUD plus workflow actions (`acknowledge`, `resolve`) and an
    aggregate `summary` endpoint.
    """
    queryset = AnomalyDetection.objects.all()
    serializer_class = AnomalyDetectionSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]

    def get_queryset(self):
        """Filter by anomaly_type, severity, status, and detected_at range."""
        queryset = super().get_queryset()
        anomaly_type = self.request.query_params.get('anomaly_type')
        if anomaly_type:
            queryset = queryset.filter(anomaly_type=anomaly_type)
        severity = self.request.query_params.get('severity')
        if severity:
            queryset = queryset.filter(severity=severity)
        status_filter = self.request.query_params.get('status')
        if status_filter:
            queryset = queryset.filter(status=status_filter)
        start_date = self.request.query_params.get('start_date')
        end_date = self.request.query_params.get('end_date')
        if start_date:
            queryset = queryset.filter(detected_at__gte=start_date)
        if end_date:
            queryset = queryset.filter(detected_at__lte=end_date)
        return queryset.order_by('-detected_at')

    @action(detail=False, methods=['get'])
    def summary(self, request):
        """Aggregate anomaly counts by severity plus resolution statistics."""
        from django.db.models import F
        queryset = self.get_queryset()
        total_anomalies = queryset.count()
        critical_anomalies = queryset.filter(severity='CRITICAL').count()
        high_anomalies = queryset.filter(severity='HIGH').count()
        medium_anomalies = queryset.filter(severity='MEDIUM').count()
        low_anomalies = queryset.filter(severity='LOW').count()
        unresolved_anomalies = queryset.filter(status__in=['DETECTED', 'INVESTIGATING']).count()
        # Share of anomalies dismissed as false positives.
        false_positives = queryset.filter(status='FALSE_POSITIVE').count()
        false_positive_rate = (false_positives / total_anomalies * 100) if total_anomalies > 0 else 0
        # Average resolution time. BUGFIX: the original aggregated
        # Avg('resolved_at' - 'detected_at'), which subtracts two Python
        # strings and raises TypeError; field arithmetic needs F() expressions.
        resolved_anomalies = queryset.filter(status='RESOLVED', resolved_at__isnull=False)
        if resolved_anomalies.exists():
            avg_resolution_time = resolved_anomalies.aggregate(
                avg_time=Avg(F('resolved_at') - F('detected_at'))
            )['avg_time']
        else:
            avg_resolution_time = None
        summary_data = {
            'total_anomalies': total_anomalies,
            'critical_anomalies': critical_anomalies,
            'high_anomalies': high_anomalies,
            'medium_anomalies': medium_anomalies,
            'low_anomalies': low_anomalies,
            'unresolved_anomalies': unresolved_anomalies,
            'false_positive_rate': Decimal(str(false_positive_rate)),
            'average_resolution_time': avg_resolution_time
        }
        serializer = AnomalySummarySerializer(summary_data)
        return Response(serializer.data)

    @action(detail=True, methods=['post'])
    def acknowledge(self, request, pk=None):
        """Move a DETECTED anomaly to INVESTIGATING; 400 otherwise."""
        anomaly = self.get_object()
        if anomaly.status == 'DETECTED':
            anomaly.status = 'INVESTIGATING'
            anomaly.save()
            return Response({
                'message': 'Anomaly acknowledged and moved to investigating status',
                'anomaly_id': str(anomaly.id),
                'status': anomaly.status
            })
        return Response({
            'error': 'Anomaly is not in DETECTED status'
        }, status=status.HTTP_400_BAD_REQUEST)

    @action(detail=True, methods=['post'])
    def resolve(self, request, pk=None):
        """Resolve an anomaly from any pre-resolution status; 400 otherwise."""
        anomaly = self.get_object()
        if anomaly.status in ['DETECTED', 'INVESTIGATING', 'CONFIRMED']:
            anomaly.status = 'RESOLVED'
            anomaly.resolved_at = timezone.now()
            anomaly.resolved_by = request.user
            anomaly.save()
            return Response({
                'message': 'Anomaly resolved',
                'anomaly_id': str(anomaly.id),
                'status': anomaly.status,
                'resolved_at': anomaly.resolved_at
            })
        return Response({
            'error': 'Anomaly cannot be resolved in current status'
        }, status=status.HTTP_400_BAD_REQUEST)
class CostImpactAnalysisViewSet(viewsets.ReadOnlyModelViewSet):
    """ViewSet for cost impact analysis.

    Read-only listing with query-string filters and an aggregate `summary`
    endpoint that totals costs by type and derives per-hour/per-user ratios.
    """
    queryset = CostImpactAnalysis.objects.all()
    serializer_class = CostImpactAnalysisSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]
    def get_queryset(self):
        """Filter queryset based on query parameters"""
        queryset = super().get_queryset()
        # Filter by cost type
        cost_type = self.request.query_params.get('cost_type')
        if cost_type:
            queryset = queryset.filter(cost_type=cost_type)
        # Filter by validation status
        is_validated = self.request.query_params.get('is_validated')
        if is_validated is not None:
            queryset = queryset.filter(is_validated=is_validated.lower() == 'true')
        # Filter by date range
        start_date = self.request.query_params.get('start_date')
        end_date = self.request.query_params.get('end_date')
        if start_date:
            queryset = queryset.filter(created_at__gte=start_date)
        if end_date:
            queryset = queryset.filter(created_at__lte=end_date)
        return queryset.order_by('-created_at')
    @action(detail=False, methods=['get'])
    def summary(self, request):
        """Get cost impact summary (totals per cost type plus derived ratios)."""
        queryset = self.get_queryset()
        # Calculate summary statistics; `or Decimal('0')` covers the
        # aggregate returning None on an empty queryset.
        total_cost = queryset.aggregate(total=Sum('cost_amount'))['total'] or Decimal('0')
        downtime_cost = queryset.filter(cost_type='DOWNTIME').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0')
        lost_revenue = queryset.filter(cost_type='LOST_REVENUE').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0')
        penalty_cost = queryset.filter(cost_type='PENALTY').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0')
        resource_cost = queryset.filter(cost_type='RESOURCE_COST').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0')
        total_downtime_hours = queryset.aggregate(total=Sum('downtime_hours'))['total'] or Decimal('0')
        total_affected_users = queryset.aggregate(total=Sum('affected_users'))['total'] or 0
        # Calculate derived metrics (guarded against zero denominators)
        cost_per_hour = (total_cost / total_downtime_hours) if total_downtime_hours > 0 else Decimal('0')
        cost_per_user = (total_cost / total_affected_users) if total_affected_users > 0 else Decimal('0')
        summary_data = {
            'total_cost': total_cost,
            # NOTE(review): currency is hard-coded — confirm single-currency assumption.
            'currency': 'USD',
            'downtime_cost': downtime_cost,
            'lost_revenue': lost_revenue,
            'penalty_cost': penalty_cost,
            'resource_cost': resource_cost,
            'total_downtime_hours': total_downtime_hours,
            'total_affected_users': total_affected_users,
            'cost_per_hour': cost_per_hour,
            'cost_per_user': cost_per_user
        }
        serializer = CostSummarySerializer(summary_data)
        return Response(serializer.data)
class DashboardConfigurationViewSet(viewsets.ModelViewSet):
    """ViewSet for dashboard configurations.

    CRUD over dashboard definitions plus a `data` action that assembles the
    full dashboard payload (KPI, anomaly, cost, and insight summaries plus
    recent activity) in one response.
    """
    queryset = DashboardConfiguration.objects.all()
    serializer_class = DashboardConfigurationSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]
    def get_queryset(self):
        """Filter queryset based on user permissions"""
        queryset = super().get_queryset()
        # Filter by dashboard type
        dashboard_type = self.request.query_params.get('dashboard_type')
        if dashboard_type:
            queryset = queryset.filter(dashboard_type=dashboard_type)
        # Filter by active status
        is_active = self.request.query_params.get('is_active')
        if is_active is not None:
            queryset = queryset.filter(is_active=is_active.lower() == 'true')
        # Filter by public dashboards or user's accessible dashboards
        if not self.request.user.is_staff:
            queryset = queryset.filter(
                Q(is_public=True) | Q(allowed_users=self.request.user)
            )
        return queryset.order_by('name')
    @action(detail=True, methods=['get'])
    def data(self, request, pk=None):
        """Get dashboard data.

        Builds the complete payload consumed by dashboard clients. Several
        values are simplified placeholders (trend, false-positive rate,
        per-hour/per-user costs, average accuracy).
        """
        dashboard = self.get_object()
        # Check if user has access to this dashboard.
        # NOTE(review): staff can see non-public dashboards via get_queryset
        # but are still denied here unless in allowed_users — confirm intended.
        if not dashboard.is_public and self.request.user not in dashboard.allowed_users.all():
            return Response({
                'error': 'Access denied to this dashboard'
            }, status=status.HTTP_403_FORBIDDEN)
        # Get KPI summary (latest measurement per active metric)
        kpi_metrics = KPIMetric.objects.filter(is_active=True)
        kpi_summaries = []
        for metric in kpi_metrics:
            latest_measurement = metric.measurements.first()
            if latest_measurement:
                kpi_summaries.append({
                    'metric_type': metric.metric_type,
                    'metric_name': metric.name,
                    'current_value': latest_measurement.value,
                    'unit': latest_measurement.unit,
                    'trend': 'stable',  # Simplified
                    'trend_percentage': Decimal('0.00'),
                    'period_start': latest_measurement.measurement_period_start,
                    'period_end': latest_measurement.measurement_period_end,
                    'incident_count': latest_measurement.incident_count,
                    'target_value': None,
                    'target_met': True
                })
        # Get anomaly summary (global counts; not scoped to this dashboard)
        anomalies = AnomalyDetection.objects.all()
        anomaly_summary = {
            'total_anomalies': anomalies.count(),
            'critical_anomalies': anomalies.filter(severity='CRITICAL').count(),
            'high_anomalies': anomalies.filter(severity='HIGH').count(),
            'medium_anomalies': anomalies.filter(severity='MEDIUM').count(),
            'low_anomalies': anomalies.filter(severity='LOW').count(),
            'unresolved_anomalies': anomalies.filter(status__in=['DETECTED', 'INVESTIGATING']).count(),
            'false_positive_rate': Decimal('0.00'),  # Simplified
            'average_resolution_time': None
        }
        # Get cost summary (totals per cost type; ratios are placeholders)
        cost_analyses = CostImpactAnalysis.objects.all()
        cost_summary = {
            'total_cost': cost_analyses.aggregate(total=Sum('cost_amount'))['total'] or Decimal('0'),
            'currency': 'USD',
            'downtime_cost': cost_analyses.filter(cost_type='DOWNTIME').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0'),
            'lost_revenue': cost_analyses.filter(cost_type='LOST_REVENUE').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0'),
            'penalty_cost': cost_analyses.filter(cost_type='PENALTY').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0'),
            'resource_cost': cost_analyses.filter(cost_type='RESOURCE_COST').aggregate(total=Sum('cost_amount'))['total'] or Decimal('0'),
            'total_downtime_hours': cost_analyses.aggregate(total=Sum('downtime_hours'))['total'] or Decimal('0'),
            'total_affected_users': cost_analyses.aggregate(total=Sum('affected_users'))['total'] or 0,
            'cost_per_hour': Decimal('0.00'),
            'cost_per_user': Decimal('0.00')
        }
        # Get insight summary (counts by confidence and lifecycle state)
        insights = PredictiveInsight.objects.all()
        insight_summary = {
            'total_insights': insights.count(),
            'high_confidence_insights': insights.filter(confidence_level='HIGH').count(),
            'medium_confidence_insights': insights.filter(confidence_level='MEDIUM').count(),
            'low_confidence_insights': insights.filter(confidence_level='LOW').count(),
            'acknowledged_insights': insights.filter(is_acknowledged=True).count(),
            'validated_insights': insights.filter(is_validated=True).count(),
            'expired_insights': insights.filter(expires_at__lt=timezone.now()).count(),
            'average_accuracy': Decimal('0.00'),
            'active_models': PredictiveModel.objects.filter(status='ACTIVE').count()
        }
        # Get recent data (fixed small slices for the dashboard widgets)
        recent_anomalies = anomalies.order_by('-detected_at')[:5]
        recent_insights = insights.order_by('-generated_at')[:5]
        heatmap_data = HeatmapData.objects.all()[:3]
        dashboard_data = {
            'kpi_summary': kpi_summaries,
            'anomaly_summary': anomaly_summary,
            'cost_summary': cost_summary,
            'insight_summary': insight_summary,
            'recent_anomalies': AnomalyDetectionSerializer(recent_anomalies, many=True).data,
            'recent_insights': PredictiveInsightSerializer(recent_insights, many=True).data,
            'heatmap_data': HeatmapDataSerializer(heatmap_data, many=True).data,
            'last_updated': timezone.now()
        }
        serializer = DashboardDataSerializer(dashboard_data)
        return Response(serializer.data)
class HeatmapDataViewSet(viewsets.ReadOnlyModelViewSet):
    """Read-only API endpoints for heatmap data.

    Supports optional filtering via the ``heatmap_type`` and
    ``time_granularity`` query parameters; results are returned
    newest-first by creation time.
    """
    queryset = HeatmapData.objects.all()
    serializer_class = HeatmapDataSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]

    def get_queryset(self):
        """Apply query-parameter filters and order newest first."""
        qs = super().get_queryset()
        # Each supported query parameter maps directly onto a model field;
        # empty or absent values leave the queryset untouched.
        for param in ('heatmap_type', 'time_granularity'):
            value = self.request.query_params.get(param)
            if value:
                qs = qs.filter(**{param: value})
        return qs.order_by('-created_at')
class PredictiveInsightViewSet(viewsets.ReadOnlyModelViewSet):
    """Read-only API for predictive insights.

    Supports filtering by ``insight_type``, ``confidence_level``,
    ``is_acknowledged`` and ``is_validated``; expired insights are hidden
    by default unless ``include_expired=true`` is supplied.
    """
    queryset = PredictiveInsight.objects.all()
    serializer_class = PredictiveInsightSerializer
    pagination_class = StandardResultsSetPagination
    permission_classes = [permissions.IsAuthenticated]

    def _filtered_queryset(self):
        """Apply every query-parameter filter except the expiry cutoff.

        Kept separate from get_queryset() so summary() can count expired
        insights, which the default queryset excludes.
        """
        queryset = super().get_queryset()
        # Direct field filters.
        insight_type = self.request.query_params.get('insight_type')
        if insight_type:
            queryset = queryset.filter(insight_type=insight_type)
        confidence_level = self.request.query_params.get('confidence_level')
        if confidence_level:
            queryset = queryset.filter(confidence_level=confidence_level)
        # Boolean filters arrive as strings; anything other than "true"
        # (case-insensitive) is treated as false.
        is_acknowledged = self.request.query_params.get('is_acknowledged')
        if is_acknowledged is not None:
            queryset = queryset.filter(is_acknowledged=is_acknowledged.lower() == 'true')
        is_validated = self.request.query_params.get('is_validated')
        if is_validated is not None:
            queryset = queryset.filter(is_validated=is_validated.lower() == 'true')
        return queryset

    def get_queryset(self):
        """Filter queryset based on query parameters, newest first."""
        queryset = self._filtered_queryset()
        # Hide expired insights unless the caller explicitly opts in.
        include_expired = self.request.query_params.get('include_expired', 'false')
        if include_expired.lower() != 'true':
            queryset = queryset.filter(expires_at__gt=timezone.now())
        return queryset.order_by('-generated_at')

    @action(detail=True, methods=['post'])
    def acknowledge(self, request, pk=None):
        """Acknowledge a predictive insight; 400 if already acknowledged."""
        insight = self.get_object()
        if insight.is_acknowledged:
            return Response({
                'error': 'Insight is already acknowledged'
            }, status=status.HTTP_400_BAD_REQUEST)
        insight.is_acknowledged = True
        insight.acknowledged_by = request.user
        insight.acknowledged_at = timezone.now()
        insight.save()
        return Response({
            'message': 'Insight acknowledged',
            'insight_id': str(insight.id),
            'acknowledged_at': insight.acknowledged_at
        })

    @action(detail=False, methods=['get'])
    def summary(self, request):
        """Get predictive insight summary statistics."""
        queryset = self.get_queryset()
        # Bug fix: the default queryset already excludes expired insights,
        # so counting expired rows on it always yielded 0. Count them on
        # the queryset *before* the expiry cutoff is applied instead.
        expired_insights = self._filtered_queryset().filter(
            expires_at__lt=timezone.now()
        ).count()
        # Average accuracy over validated insights that actually carry a
        # validation_accuracy value; None when there are none.
        validated_with_accuracy = queryset.filter(
            is_validated=True,
            validation_accuracy__isnull=False
        )
        if validated_with_accuracy.exists():
            avg_accuracy = validated_with_accuracy.aggregate(
                avg=Avg('validation_accuracy')
            )['avg']
        else:
            avg_accuracy = None
        summary_data = {
            'total_insights': queryset.count(),
            'high_confidence_insights': queryset.filter(confidence_level='HIGH').count(),
            'medium_confidence_insights': queryset.filter(confidence_level='MEDIUM').count(),
            'low_confidence_insights': queryset.filter(confidence_level='LOW').count(),
            'acknowledged_insights': queryset.filter(is_acknowledged=True).count(),
            'validated_insights': queryset.filter(is_validated=True).count(),
            'expired_insights': expired_insights,
            'average_accuracy': avg_accuracy,
            'active_models': PredictiveModel.objects.filter(status='ACTIVE').count()
        }
        serializer = PredictiveInsightSummarySerializer(summary_data)
        return Response(serializer.data)

View File

@@ -0,0 +1,477 @@
# Automation & Orchestration API Documentation
## Overview
The Automation & Orchestration module provides comprehensive automation capabilities for incident management, including runbooks, integrations with external systems, ChatOps functionality, auto-remediation, and maintenance window management.
## Features
### 1. Runbooks Automation
- **Predefined Response Steps**: Create and manage automated response procedures
- **Multiple Trigger Types**: Manual, automatic, scheduled, webhook, and ChatOps triggers
- **Execution Tracking**: Monitor runbook execution status and performance
- **Version Control**: Track runbook versions and changes
### 2. External System Integrations
- **ITSM Tools**: Jira, ServiceNow integration
- **CI/CD Tools**: GitHub, Jenkins, Ansible, Terraform
- **Chat Platforms**: Slack, Microsoft Teams, Discord, Mattermost
- **Generic APIs**: Webhook and API integrations
- **Health Monitoring**: Integration health checks and status tracking
### 3. ChatOps Integration
- **Command Execution**: Trigger workflows from chat platforms
- **Security Controls**: User and channel-based access control
- **Command History**: Track and audit ChatOps commands
- **Multi-Platform Support**: Slack, Teams, Discord, Mattermost
### 4. Auto-Remediation
- **Automatic Response**: Trigger remediation actions based on incident conditions
- **Safety Controls**: Approval workflows and execution limits
- **Multiple Remediation Types**: Service restart, deployment rollback, scaling, etc.
- **Execution Tracking**: Monitor remediation success rates and performance
### 5. Maintenance Windows
- **Scheduled Suppression**: Suppress alerts during planned maintenance
- **Service-Specific**: Target specific services and components
- **Flexible Configuration**: Control incident creation, notifications, and escalations
- **Status Management**: Automatic status updates based on schedule
### 6. Workflow Templates
- **Reusable Workflows**: Create templates for common automation scenarios
- **Parameterized Execution**: Support for input parameters and output schemas
- **Template Types**: Incident response, deployment, maintenance, scaling, monitoring
- **Usage Tracking**: Monitor template usage and performance
## API Endpoints
### Runbooks
#### List Runbooks
```
GET /api/automation/runbooks/
```
**Query Parameters:**
- `status`: Filter by status (DRAFT, ACTIVE, INACTIVE, DEPRECATED)
- `trigger_type`: Filter by trigger type (MANUAL, AUTOMATIC, SCHEDULED, WEBHOOK, CHATOPS)
- `category`: Filter by category
- `is_public`: Filter by public/private status
- `search`: Search in name, description, category
**Response:**
```json
{
"count": 10,
"next": null,
"previous": null,
"results": [
{
"id": "uuid",
"name": "Database Service Restart",
"description": "Automated runbook for restarting database services",
"version": "1.0",
"trigger_type": "AUTOMATIC",
"trigger_conditions": {
"severity": ["CRITICAL", "EMERGENCY"],
"category": "database"
},
"steps": [...],
"estimated_duration": "00:05:00",
"category": "database",
"tags": ["database", "restart", "automation"],
"status": "ACTIVE",
"is_public": true,
"execution_count": 5,
"success_rate": 0.8,
"can_trigger": true,
"created_at": "2024-01-15T10:00:00Z",
"updated_at": "2024-01-15T10:00:00Z"
}
]
}
```
#### Create Runbook
```
POST /api/automation/runbooks/
```
**Request Body:**
```json
{
"name": "New Runbook",
"description": "Description of the runbook",
"version": "1.0",
"trigger_type": "MANUAL",
"trigger_conditions": {
"severity": ["HIGH", "CRITICAL"]
},
"steps": [
{
"name": "Step 1",
"action": "check_status",
"timeout": 30,
"parameters": {"service": "web"}
}
],
"estimated_duration": "00:05:00",
"category": "web",
"tags": ["web", "restart"],
"status": "DRAFT",
"is_public": true
}
```
#### Execute Runbook
```
POST /api/automation/runbooks/{id}/execute/
```
**Request Body:**
```json
{
"trigger_data": {
"incident_id": "uuid",
"context": "additional context"
}
}
```
### Integrations
#### List Integrations
```
GET /api/automation/integrations/
```
**Query Parameters:**
- `integration_type`: Filter by type (JIRA, GITHUB, JENKINS, etc.)
- `status`: Filter by status (ACTIVE, INACTIVE, ERROR, CONFIGURING)
- `health_status`: Filter by health status (HEALTHY, WARNING, ERROR, UNKNOWN)
#### Test Integration Connection
```
POST /api/automation/integrations/{id}/test_connection/
```
#### Perform Health Check
```
POST /api/automation/integrations/{id}/health_check/
```
### ChatOps
#### List ChatOps Integrations
```
GET /api/automation/chatops-integrations/
```
#### List ChatOps Commands
```
GET /api/automation/chatops-commands/
```
**Query Parameters:**
- `status`: Filter by execution status
- `chatops_integration`: Filter by integration
- `command`: Filter by command name
- `user_id`: Filter by user ID
- `channel_id`: Filter by channel ID
### Auto-Remediation
#### List Auto-Remediations
```
GET /api/automation/auto-remediations/
```
**Query Parameters:**
- `remediation_type`: Filter by type (SERVICE_RESTART, DEPLOYMENT_ROLLBACK, etc.)
- `trigger_condition_type`: Filter by trigger condition type
- `is_active`: Filter by active status
- `requires_approval`: Filter by approval requirement
#### Approve Auto-Remediation Execution
```
POST /api/automation/auto-remediation-executions/{id}/approve/
```
**Request Body:**
```json
{
"approval_notes": "Approved for execution"
}
```
#### Reject Auto-Remediation Execution
```
POST /api/automation/auto-remediation-executions/{id}/reject/
```
**Request Body:**
```json
{
"rejection_notes": "Rejected due to risk concerns"
}
```
### Maintenance Windows
#### List Maintenance Windows
```
GET /api/automation/maintenance-windows/
```
#### Get Active Maintenance Windows
```
GET /api/automation/maintenance-windows/active/
```
#### Get Upcoming Maintenance Windows
```
GET /api/automation/maintenance-windows/upcoming/
```
### Workflow Templates
#### List Workflow Templates
```
GET /api/automation/workflow-templates/
```
**Query Parameters:**
- `template_type`: Filter by type (INCIDENT_RESPONSE, DEPLOYMENT, etc.)
- `is_public`: Filter by public/private status
## Data Models
### Runbook
- **id**: UUID primary key
- **name**: Unique name for the runbook
- **description**: Detailed description
- **version**: Version string
- **trigger_type**: How the runbook is triggered
- **trigger_conditions**: JSON conditions for triggering
- **steps**: JSON array of execution steps
- **estimated_duration**: Expected execution time
- **category**: Categorization
- **tags**: JSON array of tags
- **status**: Current status
- **is_public**: Public/private visibility
- **execution_count**: Number of executions
- **success_rate**: Success rate (0.0-1.0)
### Integration
- **id**: UUID primary key
- **name**: Unique name for the integration
- **integration_type**: Type of integration (JIRA, GITHUB, etc.)
- **description**: Description
- **configuration**: JSON configuration data
- **authentication_config**: JSON authentication data
- **status**: Integration status
- **health_status**: Health status
- **request_count**: Number of requests made
- **last_used_at**: Last usage timestamp
### ChatOpsIntegration
- **id**: UUID primary key
- **name**: Unique name
- **platform**: Chat platform (SLACK, TEAMS, etc.)
- **webhook_url**: Webhook URL
- **bot_token**: Bot authentication token
- **channel_id**: Default channel ID
- **command_prefix**: Command prefix character
- **available_commands**: JSON array of available commands
- **allowed_users**: JSON array of allowed user IDs
- **allowed_channels**: JSON array of allowed channel IDs
- **is_active**: Active status
### AutoRemediation
- **id**: UUID primary key
- **name**: Unique name
- **description**: Description
- **remediation_type**: Type of remediation action
- **trigger_conditions**: JSON trigger conditions
- **trigger_condition_type**: Type of trigger condition
- **remediation_config**: JSON remediation configuration
- **timeout_seconds**: Execution timeout
- **requires_approval**: Whether approval is required
- **approval_users**: Many-to-many relationship with users
- **max_executions_per_incident**: Maximum executions per incident
- **is_active**: Active status
- **execution_count**: Number of executions
- **success_count**: Number of successful executions
### MaintenanceWindow
- **id**: UUID primary key
- **name**: Name of the maintenance window
- **description**: Description
- **start_time**: Start datetime
- **end_time**: End datetime
- **timezone**: Timezone in which the start and end times are interpreted
- **affected_services**: JSON array of affected services
- **affected_components**: JSON array of affected components
- **suppress_incident_creation**: Whether to suppress incident creation
- **suppress_notifications**: Whether to suppress notifications
- **suppress_escalations**: Whether to suppress escalations
- **status**: Current status
- **incidents_suppressed**: Count of suppressed incidents
- **notifications_suppressed**: Count of suppressed notifications
### WorkflowTemplate
- **id**: UUID primary key
- **name**: Unique name
- **description**: Description
- **template_type**: Type of workflow template
- **workflow_steps**: JSON array of workflow steps
- **input_parameters**: JSON array of input parameters
- **output_schema**: JSON output schema
- **usage_count**: Number of times used
- **is_public**: Public/private visibility
## Security Features
### Access Control
- **User Permissions**: Role-based access control
- **Data Classification**: Integration with security module
- **Audit Logging**: Comprehensive audit trails
- **API Authentication**: Token and session authentication
### ChatOps Security
- **User Whitelisting**: Restrict commands to specific users
- **Channel Restrictions**: Limit commands to specific channels
- **Command Validation**: Validate command parameters
- **Execution Logging**: Log all command executions
### Auto-Remediation Safety
- **Approval Workflows**: Require manual approval for sensitive actions
- **Execution Limits**: Limit executions per incident
- **Timeout Controls**: Prevent runaway executions
- **Rollback Capabilities**: Support for rollback operations
## Integration with Other Modules
### Incident Intelligence Integration
- **Automatic Triggering**: Trigger runbooks based on incident characteristics
- **AI Suggestions**: AI-driven runbook recommendations
- **Correlation**: Link automation actions to incident patterns
- **Maintenance Suppression**: Suppress incidents during maintenance windows
### Security Module Integration
- **Access Control**: Use security module for authentication and authorization
- **Data Classification**: Apply data classification to automation data
- **Audit Integration**: Integrate with security audit trails
- **MFA Support**: Support multi-factor authentication for sensitive operations
## Best Practices
### Runbook Design
1. **Clear Steps**: Define clear, atomic steps
2. **Error Handling**: Include error handling and rollback procedures
3. **Timeout Management**: Set appropriate timeouts for each step
4. **Documentation**: Provide clear documentation for each step
5. **Testing**: Test runbooks in non-production environments
### Integration Management
1. **Health Monitoring**: Regularly monitor integration health
2. **Credential Management**: Securely store and rotate credentials
3. **Rate Limiting**: Implement appropriate rate limiting
4. **Error Handling**: Handle integration failures gracefully
5. **Monitoring**: Monitor integration usage and performance
### Auto-Remediation
1. **Conservative Approach**: Start with low-risk remediations
2. **Approval Workflows**: Use approval workflows for high-risk actions
3. **Monitoring**: Monitor remediation success rates
4. **Documentation**: Document all remediation actions
5. **Testing**: Test remediations in controlled environments
### Maintenance Windows
1. **Communication**: Communicate maintenance windows to stakeholders
2. **Scope Definition**: Clearly define affected services and components
3. **Rollback Plans**: Have rollback plans for maintenance activities
4. **Monitoring**: Monitor system health during maintenance
5. **Documentation**: Document maintenance activities and outcomes
## Error Handling
### Common Error Scenarios
1. **Integration Failures**: Handle external system unavailability
2. **Authentication Errors**: Handle credential expiration
3. **Timeout Errors**: Handle execution timeouts
4. **Permission Errors**: Handle insufficient permissions
5. **Data Validation Errors**: Handle invalid input data
### Error Response Format
```json
{
"error": "Error message",
"code": "ERROR_CODE",
"details": {
"field": "specific field error"
},
"timestamp": "2024-01-15T10:00:00Z"
}
```
## Rate Limiting
### Default Limits
- **API Requests**: 1000 requests per hour per user
- **Runbook Executions**: 10 executions per hour per user
- **Integration Calls**: 100 calls per hour per integration
- **ChatOps Commands**: 50 commands per hour per user
### Custom Limits
- Configure custom rate limits per user role
- Set different limits for different integration types
- Implement burst allowances for emergency situations
## Monitoring and Alerting
### Key Metrics
- **Runbook Success Rate**: Track runbook execution success
- **Integration Health**: Monitor integration availability
- **Auto-Remediation Effectiveness**: Track remediation success
- **ChatOps Usage**: Monitor ChatOps command usage
- **Maintenance Window Impact**: Track maintenance window effectiveness
### Alerting
- **Integration Failures**: Alert on integration health issues
- **Runbook Failures**: Alert on runbook execution failures
- **Auto-Remediation Issues**: Alert on remediation failures
- **Rate Limit Exceeded**: Alert on rate limit violations
- **Security Issues**: Alert on security-related events
## Troubleshooting
### Common Issues
1. **Runbook Execution Failures**: Check step configurations and permissions
2. **Integration Connection Issues**: Verify credentials and network connectivity
3. **ChatOps Command Failures**: Check user permissions and command syntax
4. **Auto-Remediation Not Triggering**: Verify trigger conditions and permissions
5. **Maintenance Window Not Working**: Check timezone and schedule configuration
### Debug Information
- Enable debug logging for detailed execution information
- Use execution logs to trace runbook and workflow execution
- Check integration health status and error messages
- Review audit logs for security and access issues
- Monitor system metrics for performance issues
## Future Enhancements
### Planned Features
1. **Visual Workflow Builder**: Drag-and-drop workflow creation
2. **Advanced AI Integration**: Enhanced AI-driven automation suggestions
3. **Multi-Cloud Support**: Support for multiple cloud providers
4. **Advanced Analytics**: Enhanced reporting and analytics capabilities
5. **Mobile Support**: Mobile app for automation management
### Integration Roadmap
1. **Additional ITSM Tools**: BMC Remedy, Cherwell, etc. (Jira and ServiceNow are already supported)
2. **Cloud Platforms**: AWS, Azure, GCP integrations
3. **Monitoring Tools**: Prometheus, Grafana, DataDog
4. **Communication Platforms**: Additional chat platforms
5. **Development Tools**: GitLab, Bitbucket, CircleCI

View File

@@ -0,0 +1,149 @@
"""
Admin configuration for automation_orchestration app
"""
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from django.utils.safestring import mark_safe
from .models import (
Runbook,
RunbookExecution,
Integration,
ChatOpsIntegration,
ChatOpsCommand,
AutoRemediation,
AutoRemediationExecution,
MaintenanceWindow,
WorkflowTemplate,
WorkflowExecution,
)
@admin.register(Runbook)
class RunbookAdmin(admin.ModelAdmin):
    """Admin interface for runbooks, grouping fields into logical sections."""

    list_display = (
        'name', 'version', 'trigger_type', 'status', 'category',
        'execution_count', 'success_rate', 'is_public', 'created_by', 'created_at',
    )
    list_filter = ('status', 'trigger_type', 'category', 'is_public', 'created_at')
    search_fields = ('name', 'description', 'category')
    # Generated identifiers, timestamps and derived statistics must not be
    # editable through the admin.
    readonly_fields = ('id', 'execution_count', 'success_rate', 'created_at', 'updated_at', 'last_executed_at')
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'description', 'version', 'category', 'tags'),
        }),
        ('Trigger Configuration', {
            'fields': ('trigger_type', 'trigger_conditions'),
        }),
        ('Content', {
            'fields': ('steps', 'estimated_duration'),
        }),
        ('Status & Permissions', {
            'fields': ('status', 'is_public'),
        }),
        ('Metadata', {
            'fields': ('created_by', 'last_modified_by', 'created_at', 'updated_at', 'last_executed_at'),
            'classes': ('collapse',),
        }),
        ('Statistics', {
            'fields': ('execution_count', 'success_rate'),
            'classes': ('collapse',),
        }),
    )
@admin.register(RunbookExecution)
class RunbookExecutionAdmin(admin.ModelAdmin):
    """Admin interface for individual runbook execution records."""

    list_display = (
        'runbook', 'triggered_by', 'trigger_type', 'status',
        'current_step', 'total_steps', 'started_at', 'duration',
    )
    list_filter = ('status', 'trigger_type', 'started_at')
    search_fields = ('runbook__name', 'triggered_by__username', 'incident__title')
    # Timestamps and duration are recorded by the system, not edited by hand.
    readonly_fields = ('id', 'started_at', 'completed_at', 'duration')
@admin.register(Integration)
class IntegrationAdmin(admin.ModelAdmin):
    """Admin interface for external system integrations."""

    list_display = (
        'name', 'integration_type', 'status', 'health_status',
        'request_count', 'last_used_at', 'created_by',
    )
    list_filter = ('integration_type', 'status', 'health_status', 'created_at')
    search_fields = ('name', 'description')
    # Usage counters and health-check data are maintained by the application.
    readonly_fields = ('id', 'request_count', 'last_used_at', 'created_at', 'updated_at', 'last_health_check')
@admin.register(ChatOpsIntegration)
class ChatOpsIntegrationAdmin(admin.ModelAdmin):
    """Admin interface for chat-platform (ChatOps) integrations."""

    list_display = ('name', 'platform', 'is_active', 'last_activity', 'created_by')
    list_filter = ('platform', 'is_active', 'created_at')
    search_fields = ('name',)
    readonly_fields = ('id', 'last_activity', 'created_at', 'updated_at')
@admin.register(ChatOpsCommand)
class ChatOpsCommandAdmin(admin.ModelAdmin):
    """Admin interface for the audit trail of executed ChatOps commands."""

    list_display = (
        'command', 'chatops_integration', 'user_id', 'status',
        'executed_at', 'completed_at',
    )
    list_filter = ('status', 'chatops_integration__platform', 'executed_at')
    search_fields = ('command', 'user_id', 'channel_id')
    readonly_fields = ('id', 'executed_at', 'completed_at')
@admin.register(AutoRemediation)
class AutoRemediationAdmin(admin.ModelAdmin):
    """Admin interface for auto-remediation rules."""

    list_display = (
        'name', 'remediation_type', 'trigger_condition_type',
        'is_active', 'requires_approval', 'execution_count', 'success_count',
    )
    list_filter = ('remediation_type', 'trigger_condition_type', 'is_active', 'requires_approval')
    search_fields = ('name', 'description')
    # Execution statistics are maintained by the application, not by admins.
    readonly_fields = ('id', 'execution_count', 'success_count', 'last_executed_at', 'created_at', 'updated_at')
@admin.register(AutoRemediationExecution)
class AutoRemediationExecutionAdmin(admin.ModelAdmin):
    """Admin interface for individual auto-remediation execution records."""

    list_display = (
        'auto_remediation', 'incident', 'status', 'triggered_at',
        'approved_by', 'completed_at',
    )
    list_filter = ('status', 'triggered_at', 'auto_remediation__remediation_type')
    search_fields = ('auto_remediation__name', 'incident__title', 'approved_by__username')
    readonly_fields = ('id', 'triggered_at', 'started_at', 'completed_at', 'duration')
@admin.register(MaintenanceWindow)
class MaintenanceWindowAdmin(admin.ModelAdmin):
    """Admin interface for maintenance windows and their suppression counters."""

    list_display = (
        'name', 'start_time', 'end_time', 'status',
        'incidents_suppressed', 'notifications_suppressed', 'created_by',
    )
    list_filter = ('status', 'start_time', 'end_time')
    search_fields = ('name', 'description')
    # Suppression counters are incremented by the system during the window.
    readonly_fields = ('id', 'incidents_suppressed', 'notifications_suppressed', 'created_at', 'updated_at')
@admin.register(WorkflowTemplate)
class WorkflowTemplateAdmin(admin.ModelAdmin):
    """Admin interface for reusable workflow templates."""

    list_display = ('name', 'template_type', 'usage_count', 'is_public', 'created_by')
    list_filter = ('template_type', 'is_public', 'created_at')
    search_fields = ('name', 'description')
    readonly_fields = ('id', 'usage_count', 'created_at', 'updated_at')
@admin.register(WorkflowExecution)
class WorkflowExecutionAdmin(admin.ModelAdmin):
    """Admin interface for workflow execution records."""

    list_display = (
        'name', 'workflow_template', 'triggered_by', 'status',
        'current_step', 'total_steps', 'started_at', 'duration',
    )
    list_filter = ('status', 'trigger_type', 'started_at')
    search_fields = ('name', 'workflow_template__name', 'triggered_by__username')
    readonly_fields = ('id', 'started_at', 'completed_at', 'duration')

View File

@@ -0,0 +1,11 @@
from django.apps import AppConfig
class AutomationOrchestrationConfig(AppConfig):
    """App configuration for the Automation & Orchestration module."""

    default_auto_field = 'django.db.models.BigAutoField'
    name = 'automation_orchestration'
    verbose_name = 'Automation & Orchestration'

    def ready(self):
        """Connect signal handlers once the app registry is fully loaded."""
        # Imported purely for its side effect (signal registration); the
        # module name itself is never used afterwards.
        import automation_orchestration.signals  # noqa: F401

View File

@@ -0,0 +1,433 @@
"""
Management command to set up automation & orchestration module
"""
from django.core.management.base import BaseCommand
from django.contrib.auth import get_user_model
from datetime import timedelta, datetime
from django.utils import timezone
from automation_orchestration.models import (
Runbook,
Integration,
ChatOpsIntegration,
AutoRemediation,
MaintenanceWindow,
WorkflowTemplate,
)
User = get_user_model()
class Command(BaseCommand):
help = 'Set up automation & orchestration module with sample data'
def add_arguments(self, parser):
    """Register the optional ``--reset`` flag on the command parser."""
    parser.add_argument(
        '--reset',
        action='store_true',
        help='Reset existing data before creating new data',
    )
def handle(self, *args, **options):
    """Entry point: optionally reset, then seed every sample data category."""
    if options['reset']:
        self.stdout.write('Resetting existing automation data...')
        self.reset_data()
    self.stdout.write('Setting up automation & orchestration module...')
    # Run each seeding step in order; every seeder is idempotent via
    # get_or_create, so re-running the command is safe.
    seeders = (
        self.create_sample_runbooks,
        self.create_sample_integrations,
        self.create_sample_chatops_integrations,
        self.create_sample_auto_remediations,
        self.create_sample_maintenance_windows,
        self.create_sample_workflow_templates,
    )
    for seeder in seeders:
        seeder()
    self.stdout.write(
        self.style.SUCCESS('Successfully set up automation & orchestration module!')
    )
def reset_data(self):
    """Delete all previously seeded automation objects wholesale."""
    for model in (
        Runbook,
        Integration,
        ChatOpsIntegration,
        AutoRemediation,
        MaintenanceWindow,
        WorkflowTemplate,
    ):
        model.objects.all().delete()
def create_sample_runbooks(self):
    """Create sample runbooks, reporting each one's creation status.

    Bug fix: the original code checked a single ``created`` flag that was
    overwritten by the second ``get_or_create``, so the messages for the
    first runbook could be wrong. Each flag is now tracked and reported
    individually.
    """
    self.stdout.write('Creating sample runbooks...')
    # Get or create a superuser to own the sample data.
    admin_user = User.objects.filter(is_superuser=True).first()
    if not admin_user:
        # SECURITY NOTE: hard-coded credentials are acceptable only for
        # local sample data; never run this against production.
        admin_user = User.objects.create_superuser(
            username='admin',
            email='admin@example.com',
            password='admin123'
        )
    # Sample runbook 1: Database restart
    runbook1, created1 = Runbook.objects.get_or_create(
        name='Database Service Restart',
        defaults={
            'description': 'Automated runbook for restarting database services',
            'version': '1.0',
            'trigger_type': 'AUTOMATIC',
            'trigger_conditions': {
                'severity': ['CRITICAL', 'EMERGENCY'],
                'category': 'database',
                'keywords': ['database', 'connection', 'timeout']
            },
            'steps': [
                {
                    'name': 'Check database status',
                    'action': 'check_service_status',
                    'timeout': 30,
                    'parameters': {'service': 'postgresql'}
                },
                {
                    'name': 'Stop database service',
                    'action': 'stop_service',
                    'timeout': 60,
                    'parameters': {'service': 'postgresql'}
                },
                {
                    'name': 'Start database service',
                    'action': 'start_service',
                    'timeout': 120,
                    'parameters': {'service': 'postgresql'}
                },
                {
                    'name': 'Verify database connectivity',
                    'action': 'verify_connectivity',
                    'timeout': 30,
                    'parameters': {'host': 'localhost', 'port': 5432}
                }
            ],
            'estimated_duration': timedelta(minutes=5),
            'category': 'database',
            'tags': ['database', 'restart', 'automation'],
            'status': 'ACTIVE',
            'is_public': True,
            'created_by': admin_user
        }
    )
    # Sample runbook 2: Web server scaling
    runbook2, created2 = Runbook.objects.get_or_create(
        name='Web Server Scale Up',
        defaults={
            'description': 'Automated runbook for scaling up web servers',
            'version': '1.0',
            'trigger_type': 'AUTOMATIC',
            'trigger_conditions': {
                'severity': ['HIGH', 'CRITICAL'],
                'category': 'performance',
                'metrics': {'cpu_usage': '>80', 'response_time': '>2000'}
            },
            'steps': [
                {
                    'name': 'Check current load',
                    'action': 'check_metrics',
                    'timeout': 30,
                    'parameters': {'metrics': ['cpu', 'memory', 'response_time']}
                },
                {
                    'name': 'Scale up instances',
                    'action': 'scale_instances',
                    'timeout': 300,
                    'parameters': {'count': 2, 'instance_type': 'web'}
                },
                {
                    'name': 'Update load balancer',
                    'action': 'update_load_balancer',
                    'timeout': 60,
                    'parameters': {'new_instances': True}
                },
                {
                    'name': 'Verify scaling',
                    'action': 'verify_scaling',
                    'timeout': 120,
                    'parameters': {'expected_instances': '+2'}
                }
            ],
            'estimated_duration': timedelta(minutes=10),
            'category': 'scaling',
            'tags': ['scaling', 'performance', 'web'],
            'status': 'ACTIVE',
            'is_public': True,
            'created_by': admin_user
        }
    )
    # Report each runbook's creation status individually.
    if created1:
        self.stdout.write(f' Created runbook: {runbook1.name}')
    if created2:
        self.stdout.write(f' Created runbook: {runbook2.name}')
    if not (created1 or created2):
        self.stdout.write(' Sample runbooks already exist')
def create_sample_integrations(self):
    """Create sample external-system integrations.

    Bug fix: the original code checked a single ``created`` flag that only
    reflected the *last* ``get_or_create``, so the Jira message could be
    printed (or suppressed) incorrectly. Each flag is now reported
    individually.
    """
    self.stdout.write('Creating sample integrations...')
    admin_user = User.objects.filter(is_superuser=True).first()
    # Jira integration
    jira_integration, jira_created = Integration.objects.get_or_create(
        name='Jira Production',
        defaults={
            'integration_type': 'JIRA',
            'description': 'Jira integration for production environment',
            'configuration': {
                'base_url': 'https://company.atlassian.net',
                'project_key': 'PROD',
                'issue_type': 'Bug'
            },
            'authentication_config': {
                'auth_type': 'basic',
                'username': 'jira_user',
                'api_token': 'encrypted_token_here'
            },
            'status': 'ACTIVE',
            'health_status': 'HEALTHY',
            'created_by': admin_user
        }
    )
    # GitHub integration
    github_integration, github_created = Integration.objects.get_or_create(
        name='GitHub Main Repository',
        defaults={
            'integration_type': 'GITHUB',
            'description': 'GitHub integration for main repository',
            'configuration': {
                'repository': 'company/main-repo',
                'branch': 'main',
                'webhook_secret': 'webhook_secret_here'
            },
            'authentication_config': {
                'auth_type': 'token',
                'access_token': 'encrypted_token_here'
            },
            'status': 'ACTIVE',
            'health_status': 'HEALTHY',
            'created_by': admin_user
        }
    )
    # Report each integration's creation status individually.
    if jira_created:
        self.stdout.write(f' Created integration: {jira_integration.name}')
    if github_created:
        self.stdout.write(f' Created integration: {github_integration.name}')
    if not (jira_created or github_created):
        self.stdout.write(' Sample integrations already exist')
def create_sample_chatops_integrations(self):
    """Seed a sample Slack ChatOps integration (idempotent)."""
    self.stdout.write('Creating sample ChatOps integrations...')
    admin_user = User.objects.filter(is_superuser=True).first()
    # Defaults are only applied when the row does not exist yet; the
    # tokens and IDs below are placeholders, not real credentials.
    slack_defaults = {
        'platform': 'SLACK',
        'webhook_url': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX',
        'bot_token': 'xoxb-0000000000000-0000000000000-XXXXXXXXXXXXXXXXXXXXXXXX',
        'channel_id': 'C0000000000',
        'command_prefix': '!',
        'available_commands': [
            {
                'name': 'incident',
                'description': 'Create or manage incidents',
                'usage': '!incident create "title" "description"'
            },
            {
                'name': 'status',
                'description': 'Check system status',
                'usage': '!status [service]'
            },
            {
                'name': 'runbook',
                'description': 'Execute runbooks',
                'usage': '!runbook execute <runbook_name>'
            }
        ],
        'allowed_users': ['U0000000000', 'U0000000001'],
        'allowed_channels': ['C0000000000', 'C0000000001'],
        'is_active': True,
        'created_by': admin_user,
    }
    slack_integration, created = ChatOpsIntegration.objects.get_or_create(
        name='Production Slack',
        defaults=slack_defaults,
    )
    if created:
        self.stdout.write(f' Created ChatOps integration: {slack_integration.name}')
    else:
        self.stdout.write(' Sample ChatOps integrations already exist')
def create_sample_auto_remediations(self):
    """Seed demo auto-remediation rules.

    Creates two sample remediations (database service restart and
    deployment rollback) via get_or_create, so repeated runs are
    idempotent.
    """
    self.stdout.write('Creating sample auto-remediations...')
    admin_user = User.objects.filter(is_superuser=True).first()
    # Service restart remediation
    service_restart, service_created = AutoRemediation.objects.get_or_create(
        name='Auto Restart Database Service',
        defaults={
            'description': 'Automatically restart database service when connection issues are detected',
            'remediation_type': 'SERVICE_RESTART',
            'trigger_conditions': {
                'severity': ['CRITICAL', 'EMERGENCY'],
                'category': 'database',
                'error_patterns': ['connection timeout', 'connection refused', 'database unavailable']
            },
            'trigger_condition_type': 'CATEGORY',
            'remediation_config': {
                'service_name': 'postgresql',
                'restart_command': 'systemctl restart postgresql',
                'verify_command': 'systemctl is-active postgresql',
                'max_restart_attempts': 3
            },
            'timeout_seconds': 300,
            'requires_approval': False,
            'max_executions_per_incident': 1,
            'is_active': True,
            'created_by': admin_user
        }
    )
    # Deployment rollback remediation
    rollback_remediation, rollback_created = AutoRemediation.objects.get_or_create(
        name='Auto Rollback Failed Deployment',
        defaults={
            'description': 'Automatically rollback deployment when critical errors are detected',
            'remediation_type': 'DEPLOYMENT_ROLLBACK',
            'trigger_conditions': {
                'severity': ['CRITICAL', 'EMERGENCY'],
                'category': 'deployment',
                'error_rate_threshold': 0.1,
                'time_window_minutes': 5
            },
            'trigger_condition_type': 'SEVERITY',
            'remediation_config': {
                'rollback_to_version': 'previous',
                'rollback_command': 'kubectl rollout undo deployment/web-app',
                'verify_command': 'kubectl get pods -l app=web-app',
                'notification_channels': ['slack', 'email']
            },
            'timeout_seconds': 600,
            'requires_approval': True,
            'max_executions_per_incident': 1,
            'is_active': True,
            'created_by': admin_user
        }
    )
    # BUG FIX: the original reused a single `created` flag for both
    # get_or_create calls, so the first record's creation status was
    # discarded and the output could claim both were created (or
    # neither) incorrectly. Track each flag independently.
    if service_created:
        self.stdout.write(f' Created auto-remediation: {service_restart.name}')
    if rollback_created:
        self.stdout.write(f' Created auto-remediation: {rollback_remediation.name}')
    if not service_created and not rollback_created:
        self.stdout.write(' Sample auto-remediations already exist')
def create_sample_maintenance_windows(self):
    """Seed a demo maintenance window (idempotent via get_or_create)."""
    self.stdout.write('Creating sample maintenance windows...')
    owner = User.objects.filter(is_superuser=True).first()
    # Weekly window: starts tomorrow, lasts two hours, suppresses all
    # incident activity for the listed services/components.
    window_defaults = {
        'description': 'Weekly maintenance window for system updates and patches',
        'start_time': timezone.now() + timedelta(days=1),
        'end_time': timezone.now() + timedelta(days=1, hours=2),
        'timezone': 'UTC',
        'affected_services': ['web-app', 'api-service', 'database'],
        'affected_components': ['load-balancer', 'cache', 'monitoring'],
        'suppress_incident_creation': True,
        'suppress_notifications': True,
        'suppress_escalations': True,
        'status': 'SCHEDULED',
        'created_by': owner,
    }
    weekly_window, was_created = MaintenanceWindow.objects.get_or_create(
        name='Weekly System Maintenance',
        defaults=window_defaults,
    )
    if was_created:
        self.stdout.write(f' Created maintenance window: {weekly_window.name}')
    else:
        self.stdout.write(' Sample maintenance windows already exist')
def create_sample_workflow_templates(self):
    """Seed a standard incident-response workflow template.

    Idempotent: keyed on the unique template name via get_or_create.
    """
    self.stdout.write('Creating sample workflow templates...')
    owner = User.objects.filter(is_superuser=True).first()
    # Ordered steps executed by the standard incident-response flow.
    response_steps = [
        {
            'name': 'Initial Assessment',
            'action': 'assess_incident',
            'conditions': {'severity': ['HIGH', 'CRITICAL', 'EMERGENCY']},
            'timeout': 300,
        },
        {
            'name': 'Notify Stakeholders',
            'action': 'notify_stakeholders',
            'conditions': {'auto_notify': True},
            'timeout': 60,
        },
        {
            'name': 'Execute Runbook',
            'action': 'execute_runbook',
            'conditions': {'has_runbook': True},
            'timeout': 1800,
        },
        {
            'name': 'Update Status',
            'action': 'update_incident_status',
            'conditions': {'always': True},
            'timeout': 30,
        },
    ]
    # Declared workflow inputs and the shape of its output payload.
    required_inputs = [
        {'name': 'incident_id', 'type': 'string', 'required': True},
        {'name': 'severity', 'type': 'string', 'required': True},
        {'name': 'category', 'type': 'string', 'required': False},
    ]
    result_schema = {
        'type': 'object',
        'properties': {
            'status': {'type': 'string'},
            'resolution_time': {'type': 'string'},
            'actions_taken': {'type': 'array'},
        },
    }
    incident_workflow, was_created = WorkflowTemplate.objects.get_or_create(
        name='Standard Incident Response',
        defaults={
            'description': 'Standard workflow for incident response and resolution',
            'template_type': 'INCIDENT_RESPONSE',
            'workflow_steps': response_steps,
            'input_parameters': required_inputs,
            'output_schema': result_schema,
            'is_public': True,
            'created_by': owner,
        },
    )
    if was_created:
        self.stdout.write(f' Created workflow template: {incident_workflow.name}')
    else:
        self.stdout.write(' Sample workflow templates already exist')

View File

@@ -0,0 +1,349 @@
# Generated by Django 5.2.6 on 2025-09-18 15:29
import django.core.validators
import django.db.models.deletion
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial schema for the automation_orchestration app.

    Auto-generated by Django 5.2.6 (makemigrations). Creates the core
    automation models (runbooks, integrations, ChatOps, auto-remediation,
    maintenance windows, workflow templates) plus their execution-log
    tables and query indexes. Do not hand-edit applied migrations —
    create a follow-up migration instead.
    """

    initial = True

    dependencies = [
        # Incident FK targets live in incident_intelligence.
        ('incident_intelligence', '0003_incident_auto_remediation_attempted_and_more'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # --- Rule/definition tables ---
        migrations.CreateModel(
            name='AutoRemediation',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200, unique=True)),
                ('description', models.TextField()),
                ('remediation_type', models.CharField(choices=[('SERVICE_RESTART', 'Service Restart'), ('DEPLOYMENT_ROLLBACK', 'Deployment Rollback'), ('SCALE_UP', 'Scale Up Resources'), ('SCALE_DOWN', 'Scale Down Resources'), ('CACHE_CLEAR', 'Clear Cache'), ('CONFIG_UPDATE', 'Configuration Update'), ('CUSTOM_SCRIPT', 'Custom Script'), ('WEBHOOK', 'Webhook Call')], max_length=30)),
                ('trigger_conditions', models.JSONField(default=dict, help_text='Conditions that trigger this remediation')),
                ('trigger_condition_type', models.CharField(choices=[('SEVERITY', 'Incident Severity'), ('CATEGORY', 'Incident Category'), ('SERVICE', 'Affected Service'), ('DURATION', 'Incident Duration'), ('PATTERN', 'Pattern Match')], max_length=20)),
                ('remediation_config', models.JSONField(default=dict, help_text='Configuration for the remediation action')),
                ('timeout_seconds', models.PositiveIntegerField(default=300, help_text='Timeout for remediation action')),
                ('requires_approval', models.BooleanField(default=False, help_text='Whether manual approval is required')),
                ('max_executions_per_incident', models.PositiveIntegerField(default=1, help_text='Max times this can run per incident')),
                ('is_active', models.BooleanField(default=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('execution_count', models.PositiveIntegerField(default=0)),
                ('success_count', models.PositiveIntegerField(default=0)),
                ('last_executed_at', models.DateTimeField(blank=True, null=True)),
                ('approval_users', models.ManyToManyField(blank=True, help_text='Users who can approve this remediation', related_name='approvable_remediations', to=settings.AUTH_USER_MODEL)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_auto_remediations', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        # Per-incident execution log for auto-remediations.
        migrations.CreateModel(
            name='AutoRemediationExecution',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('status', models.CharField(choices=[('PENDING', 'Pending'), ('APPROVED', 'Approved'), ('EXECUTING', 'Executing'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed'), ('CANCELLED', 'Cancelled'), ('TIMEOUT', 'Timeout'), ('REJECTED', 'Rejected')], default='PENDING', max_length=20)),
                ('trigger_data', models.JSONField(default=dict, help_text='Data that triggered the remediation')),
                ('approved_at', models.DateTimeField(blank=True, null=True)),
                ('approval_notes', models.TextField(blank=True, null=True)),
                ('execution_log', models.JSONField(default=list, help_text='Detailed execution log')),
                ('output_data', models.JSONField(default=dict, help_text='Output data from remediation')),
                ('error_message', models.TextField(blank=True, null=True)),
                ('triggered_at', models.DateTimeField(auto_now_add=True)),
                ('started_at', models.DateTimeField(blank=True, null=True)),
                ('completed_at', models.DateTimeField(blank=True, null=True)),
                ('duration', models.DurationField(blank=True, null=True)),
                ('approved_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='approved_remediations', to=settings.AUTH_USER_MODEL)),
                ('auto_remediation', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='executions', to='automation_orchestration.autoremediation')),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='auto_remediations', to='incident_intelligence.incident')),
            ],
            options={
                'ordering': ['-triggered_at'],
            },
        ),
        migrations.CreateModel(
            name='ChatOpsIntegration',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200, unique=True)),
                ('platform', models.CharField(choices=[('SLACK', 'Slack'), ('TEAMS', 'Microsoft Teams'), ('DISCORD', 'Discord'), ('MATTERMOST', 'Mattermost')], max_length=20)),
                ('webhook_url', models.URLField(help_text='Webhook URL for the chat platform')),
                ('bot_token', models.CharField(help_text='Bot authentication token', max_length=500)),
                ('channel_id', models.CharField(help_text='Default channel ID', max_length=100)),
                ('command_prefix', models.CharField(default='!', help_text='Command prefix (e.g., !, /)', max_length=10)),
                ('available_commands', models.JSONField(default=list, help_text='List of available commands and their descriptions')),
                ('allowed_users', models.JSONField(default=list, help_text='List of user IDs allowed to use commands')),
                ('allowed_channels', models.JSONField(default=list, help_text='List of channel IDs where commands are allowed')),
                ('is_active', models.BooleanField(default=True)),
                ('last_activity', models.DateTimeField(blank=True, null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        migrations.CreateModel(
            name='Integration',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200, unique=True)),
                ('integration_type', models.CharField(choices=[('JIRA', 'Jira'), ('GITHUB', 'GitHub'), ('JENKINS', 'Jenkins'), ('SERVICENOW', 'ServiceNow'), ('ANSIBLE', 'Ansible'), ('TERRAFORM', 'Terraform'), ('SLACK', 'Slack'), ('TEAMS', 'Microsoft Teams'), ('WEBHOOK', 'Generic Webhook'), ('API', 'Generic API')], max_length=20)),
                ('description', models.TextField(blank=True, null=True)),
                ('configuration', models.JSONField(default=dict, help_text='Integration-specific configuration (API keys, URLs, etc.)')),
                ('authentication_config', models.JSONField(default=dict, help_text='Authentication configuration (OAuth, API keys, etc.)')),
                ('status', models.CharField(choices=[('ACTIVE', 'Active'), ('INACTIVE', 'Inactive'), ('ERROR', 'Error'), ('CONFIGURING', 'Configuring')], default='CONFIGURING', max_length=20)),
                ('last_health_check', models.DateTimeField(blank=True, null=True)),
                ('health_status', models.CharField(choices=[('HEALTHY', 'Healthy'), ('WARNING', 'Warning'), ('ERROR', 'Error'), ('UNKNOWN', 'Unknown')], default='UNKNOWN', max_length=20)),
                ('error_message', models.TextField(blank=True, null=True)),
                ('request_count', models.PositiveIntegerField(default=0)),
                ('last_used_at', models.DateTimeField(blank=True, null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        migrations.CreateModel(
            name='MaintenanceWindow',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('start_time', models.DateTimeField(help_text='When maintenance window starts')),
                ('end_time', models.DateTimeField(help_text='When maintenance window ends')),
                ('timezone', models.CharField(default='UTC', max_length=50)),
                ('affected_services', models.JSONField(default=list, help_text='List of services affected by this maintenance')),
                ('affected_components', models.JSONField(default=list, help_text='List of components affected by this maintenance')),
                ('suppress_incident_creation', models.BooleanField(default=True)),
                ('suppress_notifications', models.BooleanField(default=True)),
                ('suppress_escalations', models.BooleanField(default=True)),
                ('status', models.CharField(choices=[('SCHEDULED', 'Scheduled'), ('ACTIVE', 'Active'), ('COMPLETED', 'Completed'), ('CANCELLED', 'Cancelled')], default='SCHEDULED', max_length=20)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('incidents_suppressed', models.PositiveIntegerField(default=0)),
                ('notifications_suppressed', models.PositiveIntegerField(default=0)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['start_time'],
            },
        ),
        migrations.CreateModel(
            name='Runbook',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200, unique=True)),
                ('description', models.TextField()),
                ('version', models.CharField(default='1.0', max_length=20)),
                ('trigger_type', models.CharField(choices=[('MANUAL', 'Manual Trigger'), ('AUTOMATIC', 'Automatic Trigger'), ('SCHEDULED', 'Scheduled Trigger'), ('WEBHOOK', 'Webhook Trigger'), ('CHATOPS', 'ChatOps Trigger')], default='MANUAL', max_length=20)),
                ('trigger_conditions', models.JSONField(default=dict, help_text='Conditions that trigger this runbook (incident severity, category, etc.)')),
                ('steps', models.JSONField(default=list, help_text='List of steps to execute in order')),
                ('estimated_duration', models.DurationField(help_text='Estimated time to complete')),
                ('category', models.CharField(blank=True, max_length=100, null=True)),
                ('tags', models.JSONField(default=list, help_text='Tags for categorization and search')),
                ('status', models.CharField(choices=[('DRAFT', 'Draft'), ('ACTIVE', 'Active'), ('INACTIVE', 'Inactive'), ('DEPRECATED', 'Deprecated')], default='DRAFT', max_length=20)),
                ('is_public', models.BooleanField(default=True, help_text='Whether this runbook is available to all users')),
                ('execution_count', models.PositiveIntegerField(default=0)),
                ('success_rate', models.FloatField(default=0.0, help_text='Success rate of runbook executions (0.0-1.0)', validators=[django.core.validators.MinValueValidator(0.0), django.core.validators.MaxValueValidator(1.0)])),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('last_executed_at', models.DateTimeField(blank=True, null=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_runbooks', to=settings.AUTH_USER_MODEL)),
                ('last_modified_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='modified_runbooks', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        # --- Execution/audit tables (depend on the tables above) ---
        migrations.CreateModel(
            name='ChatOpsCommand',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('command', models.CharField(help_text='The command that was executed', max_length=100)),
                ('arguments', models.JSONField(default=list, help_text='Command arguments')),
                ('user_id', models.CharField(help_text='User ID from chat platform', max_length=100)),
                ('channel_id', models.CharField(help_text='Channel ID where command was executed', max_length=100)),
                ('status', models.CharField(choices=[('PENDING', 'Pending'), ('EXECUTING', 'Executing'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed'), ('CANCELLED', 'Cancelled')], default='PENDING', max_length=20)),
                ('response_message', models.TextField(blank=True, null=True)),
                ('execution_log', models.JSONField(default=list, help_text='Detailed execution log')),
                ('error_message', models.TextField(blank=True, null=True)),
                ('executed_at', models.DateTimeField(auto_now_add=True)),
                ('completed_at', models.DateTimeField(blank=True, null=True)),
                ('related_incident', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='chatops_commands', to='incident_intelligence.incident')),
                ('chatops_integration', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='commands', to='automation_orchestration.chatopsintegration')),
                ('triggered_runbook', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='chatops_triggers', to='automation_orchestration.runbook')),
            ],
            options={
                'ordering': ['-executed_at'],
            },
        ),
        migrations.CreateModel(
            name='RunbookExecution',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('trigger_type', models.CharField(choices=[('MANUAL', 'Manual Trigger'), ('AUTOMATIC', 'Automatic Trigger'), ('SCHEDULED', 'Scheduled Trigger'), ('WEBHOOK', 'Webhook Trigger'), ('CHATOPS', 'ChatOps Trigger')], max_length=20)),
                ('trigger_data', models.JSONField(default=dict, help_text='Data that triggered the execution')),
                ('status', models.CharField(choices=[('PENDING', 'Pending'), ('RUNNING', 'Running'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed'), ('CANCELLED', 'Cancelled'), ('TIMEOUT', 'Timeout')], default='PENDING', max_length=20)),
                ('current_step', models.PositiveIntegerField(default=0)),
                ('total_steps', models.PositiveIntegerField()),
                ('execution_log', models.JSONField(default=list, help_text='Detailed execution log')),
                ('error_message', models.TextField(blank=True, null=True)),
                ('output_data', models.JSONField(default=dict, help_text='Output data from execution')),
                ('started_at', models.DateTimeField(auto_now_add=True)),
                ('completed_at', models.DateTimeField(blank=True, null=True)),
                ('duration', models.DurationField(blank=True, null=True)),
                ('incident', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='runbook_executions', to='incident_intelligence.incident')),
                ('runbook', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='executions', to='automation_orchestration.runbook')),
                ('triggered_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['-started_at'],
            },
        ),
        migrations.CreateModel(
            name='WorkflowTemplate',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200, unique=True)),
                ('description', models.TextField()),
                ('template_type', models.CharField(choices=[('INCIDENT_RESPONSE', 'Incident Response'), ('DEPLOYMENT', 'Deployment'), ('MAINTENANCE', 'Maintenance'), ('SCALING', 'Scaling'), ('MONITORING', 'Monitoring'), ('CUSTOM', 'Custom')], max_length=30)),
                ('workflow_steps', models.JSONField(default=list, help_text='List of workflow steps with conditions and actions')),
                ('input_parameters', models.JSONField(default=list, help_text='Required input parameters for the workflow')),
                ('output_schema', models.JSONField(default=dict, help_text='Expected output schema')),
                ('usage_count', models.PositiveIntegerField(default=0)),
                ('is_public', models.BooleanField(default=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        migrations.CreateModel(
            name='WorkflowExecution',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(help_text='Name for this execution instance', max_length=200)),
                ('trigger_type', models.CharField(choices=[('MANUAL', 'Manual Trigger'), ('AUTOMATIC', 'Automatic Trigger'), ('SCHEDULED', 'Scheduled Trigger'), ('WEBHOOK', 'Webhook Trigger'), ('CHATOPS', 'ChatOps Trigger')], max_length=20)),
                ('status', models.CharField(choices=[('PENDING', 'Pending'), ('RUNNING', 'Running'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed'), ('CANCELLED', 'Cancelled'), ('PAUSED', 'Paused')], default='PENDING', max_length=20)),
                ('current_step', models.PositiveIntegerField(default=0)),
                ('total_steps', models.PositiveIntegerField()),
                ('input_data', models.JSONField(default=dict, help_text='Input data for the workflow')),
                ('output_data', models.JSONField(default=dict, help_text='Output data from the workflow')),
                ('execution_log', models.JSONField(default=list, help_text='Detailed execution log')),
                ('error_message', models.TextField(blank=True, null=True)),
                ('started_at', models.DateTimeField(auto_now_add=True)),
                ('completed_at', models.DateTimeField(blank=True, null=True)),
                ('duration', models.DurationField(blank=True, null=True)),
                ('related_incident', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='workflow_executions', to='incident_intelligence.incident')),
                ('related_maintenance', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='workflow_executions', to='automation_orchestration.maintenancewindow')),
                ('triggered_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)),
                ('workflow_template', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='executions', to='automation_orchestration.workflowtemplate')),
            ],
            options={
                'ordering': ['-started_at'],
            },
        ),
        # --- Secondary indexes supporting the common query patterns ---
        migrations.AddIndex(
            model_name='autoremediation',
            index=models.Index(fields=['remediation_type', 'is_active'], name='automation__remedia_3c1fa8_idx'),
        ),
        migrations.AddIndex(
            model_name='autoremediation',
            index=models.Index(fields=['trigger_condition_type'], name='automation__trigger_264d7b_idx'),
        ),
        migrations.AddIndex(
            model_name='autoremediationexecution',
            index=models.Index(fields=['auto_remediation', 'status'], name='automation__auto_re_e8a9e2_idx'),
        ),
        migrations.AddIndex(
            model_name='autoremediationexecution',
            index=models.Index(fields=['incident', 'status'], name='automation__inciden_a63d49_idx'),
        ),
        migrations.AddIndex(
            model_name='autoremediationexecution',
            index=models.Index(fields=['triggered_at'], name='automation__trigger_8ef9fa_idx'),
        ),
        migrations.AddIndex(
            model_name='chatopsintegration',
            index=models.Index(fields=['platform', 'is_active'], name='automation__platfor_7d4e29_idx'),
        ),
        migrations.AddIndex(
            model_name='integration',
            index=models.Index(fields=['integration_type', 'status'], name='automation__integra_6734a8_idx'),
        ),
        migrations.AddIndex(
            model_name='integration',
            index=models.Index(fields=['status', 'health_status'], name='automation__status_30ecd5_idx'),
        ),
        migrations.AddIndex(
            model_name='maintenancewindow',
            index=models.Index(fields=['start_time', 'end_time'], name='automation__start_t_b3c4cd_idx'),
        ),
        migrations.AddIndex(
            model_name='maintenancewindow',
            index=models.Index(fields=['status'], name='automation__status_da957b_idx'),
        ),
        migrations.AddIndex(
            model_name='runbook',
            index=models.Index(fields=['status', 'trigger_type'], name='automation__status_bfcafe_idx'),
        ),
        migrations.AddIndex(
            model_name='runbook',
            index=models.Index(fields=['category'], name='automation__categor_dd8bc8_idx'),
        ),
        migrations.AddIndex(
            model_name='runbook',
            index=models.Index(fields=['created_at'], name='automation__created_ad879a_idx'),
        ),
        migrations.AddIndex(
            model_name='chatopscommand',
            index=models.Index(fields=['chatops_integration', 'status'], name='automation__chatops_3b0b3a_idx'),
        ),
        migrations.AddIndex(
            model_name='chatopscommand',
            index=models.Index(fields=['user_id', 'executed_at'], name='automation__user_id_390588_idx'),
        ),
        migrations.AddIndex(
            model_name='chatopscommand',
            index=models.Index(fields=['channel_id', 'executed_at'], name='automation__channel_35c09f_idx'),
        ),
        migrations.AddIndex(
            model_name='runbookexecution',
            index=models.Index(fields=['runbook', 'status'], name='automation__runbook_534aaf_idx'),
        ),
        migrations.AddIndex(
            model_name='runbookexecution',
            index=models.Index(fields=['triggered_by', 'started_at'], name='automation__trigger_05e907_idx'),
        ),
        migrations.AddIndex(
            model_name='runbookexecution',
            index=models.Index(fields=['incident', 'status'], name='automation__inciden_4231a4_idx'),
        ),
        migrations.AddIndex(
            model_name='workflowtemplate',
            index=models.Index(fields=['template_type', 'is_public'], name='automation__templat_3aecbb_idx'),
        ),
        migrations.AddIndex(
            model_name='workflowexecution',
            index=models.Index(fields=['workflow_template', 'status'], name='automation__workflo_1a0d89_idx'),
        ),
        migrations.AddIndex(
            model_name='workflowexecution',
            index=models.Index(fields=['triggered_by', 'started_at'], name='automation__trigger_072811_idx'),
        ),
        migrations.AddIndex(
            model_name='workflowexecution',
            index=models.Index(fields=['related_incident', 'status'], name='automation__related_08164b_idx'),
        ),
    ]

View File

@@ -0,0 +1,25 @@
# Generated by Django 5.2.6 on 2025-09-18 15:51
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Link automation execution records to SLA instances.

    Auto-generated by Django 5.2.6 (makemigrations). Adds nullable
    ``sla_instance`` foreign keys to AutoRemediationExecution and
    RunbookExecution so executions can be correlated with the SLA that
    triggered them. SET_NULL keeps execution history if the SLA
    instance is deleted.
    """

    dependencies = [
        ('automation_orchestration', '0001_initial'),
        ('sla_oncall', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='autoremediationexecution',
            name='sla_instance',
            field=models.ForeignKey(blank=True, help_text='SLA instance related to this auto-remediation', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='auto_remediations', to='sla_oncall.slainstance'),
        ),
        migrations.AddField(
            model_name='runbookexecution',
            name='sla_instance',
            field=models.ForeignKey(blank=True, help_text='SLA instance that triggered this runbook execution', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='runbook_executions', to='sla_oncall.slainstance'),
        ),
    ]

View File

@@ -0,0 +1,680 @@
"""
Automation & Orchestration models for Enterprise Incident Management API
Implements runbooks, integrations, ChatOps, auto-remediation, and maintenance scheduling
"""
import uuid
import json
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from django.db import models
from django.contrib.auth import get_user_model
from django.core.validators import MinValueValidator, MaxValueValidator
from django.utils import timezone
from django.core.exceptions import ValidationError
User = get_user_model()
class Runbook(models.Model):
    """Predefined response steps for incident automation.

    A runbook is a versioned, ordered list of steps (stored as JSON)
    that can be triggered manually, automatically, on a schedule, via
    webhook, or from ChatOps. Execution history lives in
    RunbookExecution; aggregate stats (execution_count, success_rate)
    are stored here.
    """
    TRIGGER_TYPES = [
        ('MANUAL', 'Manual Trigger'),
        ('AUTOMATIC', 'Automatic Trigger'),
        ('SCHEDULED', 'Scheduled Trigger'),
        ('WEBHOOK', 'Webhook Trigger'),
        ('CHATOPS', 'ChatOps Trigger'),
    ]
    STATUS_CHOICES = [
        ('DRAFT', 'Draft'),
        ('ACTIVE', 'Active'),
        ('INACTIVE', 'Inactive'),
        ('DEPRECATED', 'Deprecated'),
    ]
    # UUID primary key (non-editable, generated at insert time).
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200, unique=True)
    description = models.TextField()
    version = models.CharField(max_length=20, default='1.0')
    # Trigger configuration
    trigger_type = models.CharField(max_length=20, choices=TRIGGER_TYPES, default='MANUAL')
    trigger_conditions = models.JSONField(
        default=dict,
        help_text="Conditions that trigger this runbook (incident severity, category, etc.)"
    )
    # Runbook content
    steps = models.JSONField(
        default=list,
        help_text="List of steps to execute in order"
    )
    estimated_duration = models.DurationField(help_text="Estimated time to complete")
    # Categorization
    category = models.CharField(max_length=100, blank=True, null=True)
    tags = models.JSONField(default=list, help_text="Tags for categorization and search")
    # Status and metadata
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='DRAFT')
    is_public = models.BooleanField(default=True, help_text="Whether this runbook is available to all users")
    # SET_NULL preserves runbooks when the authoring user is deleted.
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True, related_name='created_runbooks')
    last_modified_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True, related_name='modified_runbooks')
    # Execution tracking
    execution_count = models.PositiveIntegerField(default=0)
    success_rate = models.FloatField(
        validators=[MinValueValidator(0.0), MaxValueValidator(1.0)],
        default=0.0,
        help_text="Success rate of runbook executions (0.0-1.0)"
    )
    # Timestamps
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)
    last_executed_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['status', 'trigger_type']),
            models.Index(fields=['category']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self) -> str:
        """Human-readable identifier, e.g. 'Restart DB v1.0'."""
        return f"{self.name} v{self.version}"

    def can_be_triggered_by(self, user: User) -> bool:
        """Check if user can trigger this runbook.

        Private (non-public) runbooks may only be triggered by their
        creator; in all cases the runbook must be ACTIVE.
        """
        if not self.is_public and self.created_by != user:
            return False
        return self.status == 'ACTIVE'
class RunbookExecution(models.Model):
    """Execution log for runbook runs.

    One row per run of a Runbook: records who/what triggered it, the
    optional incident and SLA instance it relates to, step-by-step
    progress, results, and timing.
    """
    STATUS_CHOICES = [
        ('PENDING', 'Pending'),
        ('RUNNING', 'Running'),
        ('COMPLETED', 'Completed'),
        ('FAILED', 'Failed'),
        ('CANCELLED', 'Cancelled'),
        ('TIMEOUT', 'Timeout'),
    ]
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    # CASCADE: deleting a runbook removes its execution history.
    runbook = models.ForeignKey(Runbook, on_delete=models.CASCADE, related_name='executions')
    # Execution context
    triggered_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    trigger_type = models.CharField(max_length=20, choices=Runbook.TRIGGER_TYPES)
    trigger_data = models.JSONField(default=dict, help_text="Data that triggered the execution")
    # Related incident (if applicable)
    incident = models.ForeignKey(
        'incident_intelligence.Incident',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='runbook_executions'
    )
    # SLA Integration
    sla_instance = models.ForeignKey(
        'sla_oncall.SLAInstance',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='runbook_executions',
        help_text="SLA instance that triggered this runbook execution"
    )
    # Execution details
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='PENDING')
    current_step = models.PositiveIntegerField(default=0)
    total_steps = models.PositiveIntegerField()
    # Results
    execution_log = models.JSONField(default=list, help_text="Detailed execution log")
    error_message = models.TextField(blank=True, null=True)
    output_data = models.JSONField(default=dict, help_text="Output data from execution")
    # Performance metrics
    started_at = models.DateTimeField(auto_now_add=True)
    completed_at = models.DateTimeField(null=True, blank=True)
    duration = models.DurationField(null=True, blank=True)

    class Meta:
        ordering = ['-started_at']
        indexes = [
            models.Index(fields=['runbook', 'status']),
            models.Index(fields=['triggered_by', 'started_at']),
            models.Index(fields=['incident', 'status']),
        ]

    def __str__(self) -> str:
        return f"Execution of {self.runbook.name} - {self.status}"

    @property
    def is_running(self) -> bool:
        """True while the execution is in the RUNNING state."""
        return self.status == 'RUNNING'

    @property
    def is_completed(self) -> bool:
        """True once the execution has reached any terminal state."""
        return self.status in ['COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT']
class Integration(models.Model):
    """External system integration (ITSM / CI-CD / chat tooling).

    Holds the connection configuration, credentials, health state and usage
    counters for one third-party endpoint.  Health fields appear to be
    maintained by an external health-check task — not visible in this module.
    """

    # Supported integration backends.
    INTEGRATION_TYPES = [
        ('JIRA', 'Jira'),
        ('GITHUB', 'GitHub'),
        ('JENKINS', 'Jenkins'),
        ('SERVICENOW', 'ServiceNow'),
        ('ANSIBLE', 'Ansible'),
        ('TERRAFORM', 'Terraform'),
        ('SLACK', 'Slack'),
        ('TEAMS', 'Microsoft Teams'),
        ('WEBHOOK', 'Generic Webhook'),
        ('API', 'Generic API'),
    ]
    # Lifecycle of the integration record itself (new records start CONFIGURING).
    STATUS_CHOICES = [
        ('ACTIVE', 'Active'),
        ('INACTIVE', 'Inactive'),
        ('ERROR', 'Error'),
        ('CONFIGURING', 'Configuring'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200, unique=True)
    integration_type = models.CharField(max_length=20, choices=INTEGRATION_TYPES)
    description = models.TextField(blank=True, null=True)

    # Configuration
    # NOTE(review): both JSON blobs may contain secrets (API keys, tokens) and
    # are stored unencrypted at this layer — confirm encryption-at-rest upstream.
    configuration = models.JSONField(
        default=dict,
        help_text="Integration-specific configuration (API keys, URLs, etc.)"
    )
    authentication_config = models.JSONField(
        default=dict,
        help_text="Authentication configuration (OAuth, API keys, etc.)"
    )

    # Status and health
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='CONFIGURING')
    last_health_check = models.DateTimeField(null=True, blank=True)
    health_status = models.CharField(
        max_length=20,
        choices=[
            ('HEALTHY', 'Healthy'),
            ('WARNING', 'Warning'),
            ('ERROR', 'Error'),
            ('UNKNOWN', 'Unknown'),
        ],
        default='UNKNOWN'
    )
    error_message = models.TextField(blank=True, null=True)

    # Usage tracking
    request_count = models.PositiveIntegerField(default=0)
    last_used_at = models.DateTimeField(null=True, blank=True)

    # Metadata
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['integration_type', 'status']),
            models.Index(fields=['status', 'health_status']),
        ]

    def __str__(self) -> str:
        return f"{self.name} ({self.integration_type})"

    def is_healthy(self) -> bool:
        """Check if integration is healthy and ready to use.

        Requires both an ACTIVE record status and a HEALTHY last health check.
        """
        return self.status == 'ACTIVE' and self.health_status == 'HEALTHY'
class ChatOpsIntegration(models.Model):
    """ChatOps integration for triggering workflows from chat platforms.

    One record per connected chat workspace/bot; individual command runs are
    recorded in :class:`ChatOpsCommand`.
    """

    PLATFORM_CHOICES = [
        ('SLACK', 'Slack'),
        ('TEAMS', 'Microsoft Teams'),
        ('DISCORD', 'Discord'),
        ('MATTERMOST', 'Mattermost'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200, unique=True)
    platform = models.CharField(max_length=20, choices=PLATFORM_CHOICES)

    # Platform configuration
    webhook_url = models.URLField(help_text="Webhook URL for the chat platform")
    # NOTE(review): token is stored as a plain CharField — confirm it is
    # encrypted at rest / excluded from API responses where appropriate.
    bot_token = models.CharField(max_length=500, help_text="Bot authentication token")
    channel_id = models.CharField(max_length=100, help_text="Default channel ID")

    # Command configuration
    command_prefix = models.CharField(max_length=10, default='!', help_text="Command prefix (e.g., !, /)")
    available_commands = models.JSONField(
        default=list,
        help_text="List of available commands and their descriptions"
    )

    # Security
    # Empty lists here — presumably meaning "no restriction" — TODO confirm
    # against the command-dispatch code (not visible in this module).
    allowed_users = models.JSONField(
        default=list,
        help_text="List of user IDs allowed to use commands"
    )
    allowed_channels = models.JSONField(
        default=list,
        help_text="List of channel IDs where commands are allowed"
    )

    # Status
    is_active = models.BooleanField(default=True)
    last_activity = models.DateTimeField(null=True, blank=True)

    # Metadata
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['platform', 'is_active']),
        ]

    def __str__(self) -> str:
        return f"{self.name} ({self.platform})"
class ChatOpsCommand(models.Model):
    """One ChatOps command invocation and its execution record.

    Links a chat-platform command back to the runbook it triggered and/or the
    incident it concerned.  ``user_id``/``channel_id`` are platform-native
    identifiers, not foreign keys into our User model.
    """

    STATUS_CHOICES = [
        ('PENDING', 'Pending'),
        ('EXECUTING', 'Executing'),
        ('COMPLETED', 'Completed'),
        ('FAILED', 'Failed'),
        ('CANCELLED', 'Cancelled'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    chatops_integration = models.ForeignKey(ChatOpsIntegration, on_delete=models.CASCADE, related_name='commands')

    # Command details
    command = models.CharField(max_length=100, help_text="The command that was executed")
    arguments = models.JSONField(default=list, help_text="Command arguments")
    user_id = models.CharField(max_length=100, help_text="User ID from chat platform")
    channel_id = models.CharField(max_length=100, help_text="Channel ID where command was executed")

    # Execution context
    triggered_runbook = models.ForeignKey(
        Runbook,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='chatops_triggers'
    )
    related_incident = models.ForeignKey(
        'incident_intelligence.Incident',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='chatops_commands'
    )

    # Execution results
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='PENDING')
    # Message posted back to the chat channel, if any.
    response_message = models.TextField(blank=True, null=True)
    execution_log = models.JSONField(default=list, help_text="Detailed execution log")
    error_message = models.TextField(blank=True, null=True)

    # Timestamps
    executed_at = models.DateTimeField(auto_now_add=True)
    completed_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        ordering = ['-executed_at']
        indexes = [
            models.Index(fields=['chatops_integration', 'status']),
            models.Index(fields=['user_id', 'executed_at']),
            models.Index(fields=['channel_id', 'executed_at']),
        ]

    def __str__(self) -> str:
        return f"{self.command} by {self.user_id} - {self.status}"
class AutoRemediation(models.Model):
    """Auto-remediation rule: an automatic response to matching incidents.

    Defines what action to take (restart, rollback, scale, ...), when it
    fires, and the safety rails (approval workflow, per-incident execution
    cap).  Individual runs are recorded in AutoRemediationExecution.
    """

    REMEDIATION_TYPES = [
        ('SERVICE_RESTART', 'Service Restart'),
        ('DEPLOYMENT_ROLLBACK', 'Deployment Rollback'),
        ('SCALE_UP', 'Scale Up Resources'),
        ('SCALE_DOWN', 'Scale Down Resources'),
        ('CACHE_CLEAR', 'Clear Cache'),
        ('CONFIG_UPDATE', 'Configuration Update'),
        ('CUSTOM_SCRIPT', 'Custom Script'),
        ('WEBHOOK', 'Webhook Call'),
    ]
    # What dimension of an incident the trigger_conditions are evaluated on.
    TRIGGER_CONDITIONS = [
        ('SEVERITY', 'Incident Severity'),
        ('CATEGORY', 'Incident Category'),
        ('SERVICE', 'Affected Service'),
        ('DURATION', 'Incident Duration'),
        ('PATTERN', 'Pattern Match'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200, unique=True)
    description = models.TextField()
    remediation_type = models.CharField(max_length=30, choices=REMEDIATION_TYPES)

    # Trigger configuration
    trigger_conditions = models.JSONField(
        default=dict,
        help_text="Conditions that trigger this remediation"
    )
    trigger_condition_type = models.CharField(max_length=20, choices=TRIGGER_CONDITIONS)

    # Remediation configuration
    remediation_config = models.JSONField(
        default=dict,
        help_text="Configuration for the remediation action"
    )
    timeout_seconds = models.PositiveIntegerField(default=300, help_text="Timeout for remediation action")

    # Safety and approval
    requires_approval = models.BooleanField(default=False, help_text="Whether manual approval is required")
    approval_users = models.ManyToManyField(User, blank=True, related_name='approvable_remediations', help_text="Users who can approve this remediation")
    max_executions_per_incident = models.PositiveIntegerField(default=1, help_text="Max times this can run per incident")

    # Status and metadata
    is_active = models.BooleanField(default=True)
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True, related_name='created_auto_remediations')
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Execution tracking (denormalized counters; see app signal handlers)
    execution_count = models.PositiveIntegerField(default=0)
    success_count = models.PositiveIntegerField(default=0)
    last_executed_at = models.DateTimeField(null=True, blank=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['remediation_type', 'is_active']),
            models.Index(fields=['trigger_condition_type']),
        ]

    def __str__(self) -> str:
        return f"{self.name} ({self.remediation_type})"

    @property
    def success_rate(self) -> float:
        """Fraction of executions that completed successfully (0.0-1.0)."""
        if self.execution_count == 0:
            return 0.0
        return self.success_count / self.execution_count
class AutoRemediationExecution(models.Model):
    """One run of an AutoRemediation rule against a specific incident.

    Captures the approval trail (when the rule requires approval), the
    execution log/output, and timing information.
    """

    STATUS_CHOICES = [
        ('PENDING', 'Pending'),
        ('APPROVED', 'Approved'),
        ('EXECUTING', 'Executing'),
        ('COMPLETED', 'Completed'),
        ('FAILED', 'Failed'),
        ('CANCELLED', 'Cancelled'),
        ('TIMEOUT', 'Timeout'),
        ('REJECTED', 'Rejected'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    auto_remediation = models.ForeignKey(AutoRemediation, on_delete=models.CASCADE, related_name='executions')

    # Related incident — CASCADE: execution records die with their incident.
    incident = models.ForeignKey(
        'incident_intelligence.Incident',
        on_delete=models.CASCADE,
        related_name='auto_remediations'
    )

    # SLA Integration
    sla_instance = models.ForeignKey(
        'sla_oncall.SLAInstance',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='auto_remediations',
        help_text="SLA instance related to this auto-remediation"
    )

    # Execution details
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='PENDING')
    trigger_data = models.JSONField(default=dict, help_text="Data that triggered the remediation")

    # Approval workflow
    approved_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True, blank=True, related_name='approved_remediations')
    approved_at = models.DateTimeField(null=True, blank=True)
    approval_notes = models.TextField(blank=True, null=True)

    # Execution results
    execution_log = models.JSONField(default=list, help_text="Detailed execution log")
    output_data = models.JSONField(default=dict, help_text="Output data from remediation")
    error_message = models.TextField(blank=True, null=True)

    # Timestamps: triggered (row created) -> started -> completed.
    triggered_at = models.DateTimeField(auto_now_add=True)
    started_at = models.DateTimeField(null=True, blank=True)
    completed_at = models.DateTimeField(null=True, blank=True)
    duration = models.DurationField(null=True, blank=True)

    class Meta:
        ordering = ['-triggered_at']
        indexes = [
            models.Index(fields=['auto_remediation', 'status']),
            models.Index(fields=['incident', 'status']),
            models.Index(fields=['triggered_at']),
        ]

    def __str__(self) -> str:
        return f"Remediation {self.auto_remediation.name} for {self.incident.title} - {self.status}"
class MaintenanceWindow(models.Model):
    """Scheduled maintenance window used to suppress alerting.

    While a window is ACTIVE, incident creation, notifications and
    escalations can each be independently suppressed for the listed
    services/components.
    """

    STATUS_CHOICES = [
        ('SCHEDULED', 'Scheduled'),
        ('ACTIVE', 'Active'),
        ('COMPLETED', 'Completed'),
        ('CANCELLED', 'Cancelled'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200)
    description = models.TextField()

    # Schedule
    start_time = models.DateTimeField(help_text="When maintenance window starts")
    end_time = models.DateTimeField(help_text="When maintenance window ends")
    # NOTE: this field shadows django.utils.timezone only at class scope;
    # method bodies below still resolve `timezone` to the module.
    timezone = models.CharField(max_length=50, default='UTC')

    # Scope
    affected_services = models.JSONField(
        default=list,
        help_text="List of services affected by this maintenance"
    )
    affected_components = models.JSONField(
        default=list,
        help_text="List of components affected by this maintenance"
    )

    # Alert suppression toggles
    suppress_incident_creation = models.BooleanField(default=True)
    suppress_notifications = models.BooleanField(default=True)
    suppress_escalations = models.BooleanField(default=True)

    # Status and metadata
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='SCHEDULED')
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    # Execution tracking (counters incremented by the suppression logic,
    # which lives outside this module)
    incidents_suppressed = models.PositiveIntegerField(default=0)
    notifications_suppressed = models.PositiveIntegerField(default=0)

    class Meta:
        ordering = ['start_time']
        indexes = [
            models.Index(fields=['start_time', 'end_time']),
            models.Index(fields=['status']),
        ]

    def __str__(self) -> str:
        return f"{self.name} ({self.start_time} - {self.end_time})"

    def is_active(self) -> bool:
        """Check if maintenance window is currently active.

        Requires BOTH the current time to fall inside the window AND the
        status field to already be 'ACTIVE'.
        """
        now = timezone.now()
        return self.start_time <= now <= self.end_time and self.status == 'ACTIVE'

    def is_scheduled(self) -> bool:
        """Check if maintenance window is scheduled for the future."""
        now = timezone.now()
        return self.start_time > now and self.status == 'SCHEDULED'

    def clean(self):
        """Validate maintenance window data (model-level validation hook)."""
        if self.start_time >= self.end_time:
            raise ValidationError("Start time must be before end time")
class WorkflowTemplate(models.Model):
    """Reusable workflow template for common automation scenarios.

    A template is the static definition (steps, inputs, output schema);
    each actual run is a WorkflowExecution.
    """

    TEMPLATE_TYPES = [
        ('INCIDENT_RESPONSE', 'Incident Response'),
        ('DEPLOYMENT', 'Deployment'),
        ('MAINTENANCE', 'Maintenance'),
        ('SCALING', 'Scaling'),
        ('MONITORING', 'Monitoring'),
        ('CUSTOM', 'Custom'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    name = models.CharField(max_length=200, unique=True)
    description = models.TextField()
    template_type = models.CharField(max_length=30, choices=TEMPLATE_TYPES)

    # Template content
    workflow_steps = models.JSONField(
        default=list,
        help_text="List of workflow steps with conditions and actions"
    )
    input_parameters = models.JSONField(
        default=list,
        help_text="Required input parameters for the workflow"
    )
    output_schema = models.JSONField(
        default=dict,
        help_text="Expected output schema"
    )

    # Usage and metadata
    usage_count = models.PositiveIntegerField(default=0)
    is_public = models.BooleanField(default=True)
    created_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    class Meta:
        ordering = ['name']
        indexes = [
            models.Index(fields=['template_type', 'is_public']),
        ]

    def __str__(self) -> str:
        return f"{self.name} ({self.template_type})"
class WorkflowExecution(models.Model):
    """One execution of a WorkflowTemplate.

    Tracks progress (current step out of total), the input/output payloads,
    and optional links to the incident or maintenance window that prompted
    the run.
    """

    STATUS_CHOICES = [
        ('PENDING', 'Pending'),
        ('RUNNING', 'Running'),
        ('COMPLETED', 'Completed'),
        ('FAILED', 'Failed'),
        ('CANCELLED', 'Cancelled'),
        ('PAUSED', 'Paused'),
    ]

    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
    workflow_template = models.ForeignKey(WorkflowTemplate, on_delete=models.CASCADE, related_name='executions')

    # Execution context
    name = models.CharField(max_length=200, help_text="Name for this execution instance")
    triggered_by = models.ForeignKey(User, on_delete=models.SET_NULL, null=True)
    # Reuses the trigger vocabulary defined on Runbook.
    trigger_type = models.CharField(max_length=20, choices=Runbook.TRIGGER_TYPES)

    # Related objects (both optional)
    related_incident = models.ForeignKey(
        'incident_intelligence.Incident',
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='workflow_executions'
    )
    related_maintenance = models.ForeignKey(
        MaintenanceWindow,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='workflow_executions'
    )

    # Execution state
    status = models.CharField(max_length=20, choices=STATUS_CHOICES, default='PENDING')
    current_step = models.PositiveIntegerField(default=0)
    total_steps = models.PositiveIntegerField()

    # Input/Output
    input_data = models.JSONField(default=dict, help_text="Input data for the workflow")
    output_data = models.JSONField(default=dict, help_text="Output data from the workflow")
    execution_log = models.JSONField(default=list, help_text="Detailed execution log")
    error_message = models.TextField(blank=True, null=True)

    # Timestamps
    started_at = models.DateTimeField(auto_now_add=True)
    completed_at = models.DateTimeField(null=True, blank=True)
    duration = models.DurationField(null=True, blank=True)

    class Meta:
        ordering = ['-started_at']
        indexes = [
            models.Index(fields=['workflow_template', 'status']),
            models.Index(fields=['triggered_by', 'started_at']),
            models.Index(fields=['related_incident', 'status']),
        ]

    def __str__(self) -> str:
        return f"Workflow {self.name} - {self.status}"

View File

@@ -0,0 +1,29 @@
"""
Serializers for Automation & Orchestration module
"""
from .automation import (
RunbookSerializer,
RunbookExecutionSerializer,
IntegrationSerializer,
ChatOpsIntegrationSerializer,
ChatOpsCommandSerializer,
AutoRemediationSerializer,
AutoRemediationExecutionSerializer,
MaintenanceWindowSerializer,
WorkflowTemplateSerializer,
WorkflowExecutionSerializer,
)
__all__ = [
'RunbookSerializer',
'RunbookExecutionSerializer',
'IntegrationSerializer',
'ChatOpsIntegrationSerializer',
'ChatOpsCommandSerializer',
'AutoRemediationSerializer',
'AutoRemediationExecutionSerializer',
'MaintenanceWindowSerializer',
'WorkflowTemplateSerializer',
'WorkflowExecutionSerializer',
]

View File

@@ -0,0 +1,308 @@
"""
Serializers for Automation & Orchestration models
"""
from rest_framework import serializers
from django.contrib.auth import get_user_model
from ..models import (
Runbook,
RunbookExecution,
Integration,
ChatOpsIntegration,
ChatOpsCommand,
AutoRemediation,
AutoRemediationExecution,
MaintenanceWindow,
WorkflowTemplate,
WorkflowExecution,
)
User = get_user_model()
class RunbookSerializer(serializers.ModelSerializer):
    """Serializer for Runbook.

    Adds read-only convenience fields (creator/modifier usernames) and a
    per-request ``can_trigger`` flag computed via the model's own
    permission check.
    """

    # Flattened usernames so clients don't need a second lookup.
    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    last_modified_by_username = serializers.CharField(source='last_modified_by.username', read_only=True)
    # Computed per request; see get_can_trigger().
    can_trigger = serializers.SerializerMethodField()

    class Meta:
        model = Runbook
        fields = [
            'id', 'name', 'description', 'version', 'trigger_type', 'trigger_conditions',
            'steps', 'estimated_duration', 'category', 'tags', 'status', 'is_public',
            'created_by', 'created_by_username', 'last_modified_by', 'last_modified_by_username',
            'execution_count', 'success_rate', 'created_at', 'updated_at', 'last_executed_at',
            'can_trigger'
        ]
        read_only_fields = [
            'id', 'created_by_username', 'last_modified_by_username', 'execution_count',
            'success_rate', 'created_at', 'updated_at', 'last_executed_at', 'can_trigger'
        ]

    def get_can_trigger(self, obj):
        """Check if current user can trigger this runbook."""
        # NOTE(review): request.user may be AnonymousUser here; assumes
        # Runbook.can_be_triggered_by copes with that — confirm in the model.
        request = self.context.get('request')
        if request and request.user:
            return obj.can_be_triggered_by(request.user)
        return False

    def validate_steps(self, value):
        """Validate that steps is a list of dicts, each with name/action/timeout."""
        if not isinstance(value, list):
            raise serializers.ValidationError("Steps must be a list")
        for i, step in enumerate(value):
            if not isinstance(step, dict):
                raise serializers.ValidationError(f"Step {i+1} must be a dictionary")
            required_fields = ['name', 'action', 'timeout']
            for field in required_fields:
                if field not in step:
                    raise serializers.ValidationError(f"Step {i+1} missing required field: {field}")
        return value
class RunbookExecutionSerializer(serializers.ModelSerializer):
    """Serializer for RunbookExecution.

    Flattens names/titles of related objects and exposes the model's
    ``is_running``/``is_completed`` properties as read-only booleans.
    """

    runbook_name = serializers.CharField(source='runbook.name', read_only=True)
    triggered_by_username = serializers.CharField(source='triggered_by.username', read_only=True)
    incident_title = serializers.CharField(source='incident.title', read_only=True)
    # Backed by model @property methods of the same name.
    is_running = serializers.BooleanField(read_only=True)
    is_completed = serializers.BooleanField(read_only=True)

    class Meta:
        model = RunbookExecution
        fields = [
            'id', 'runbook', 'runbook_name', 'triggered_by', 'triggered_by_username',
            'trigger_type', 'trigger_data', 'incident', 'incident_title', 'status',
            'current_step', 'total_steps', 'execution_log', 'error_message', 'output_data',
            'started_at', 'completed_at', 'duration', 'is_running', 'is_completed'
        ]
        read_only_fields = [
            'id', 'runbook_name', 'triggered_by_username', 'incident_title',
            'is_running', 'is_completed', 'started_at', 'completed_at', 'duration'
        ]
class IntegrationSerializer(serializers.ModelSerializer):
    """Serializer for Integration.

    NOTE(review): ``configuration`` and ``authentication_config`` may carry
    secrets and are serialized verbatim here — confirm that API permissions
    restrict who can read this serializer's output.
    """

    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    # Backed by the model's is_healthy() method.
    is_healthy = serializers.BooleanField(read_only=True)

    class Meta:
        model = Integration
        fields = [
            'id', 'name', 'integration_type', 'description', 'configuration',
            'authentication_config', 'status', 'last_health_check', 'health_status',
            'error_message', 'request_count', 'last_used_at', 'created_by',
            'created_by_username', 'created_at', 'updated_at', 'is_healthy'
        ]
        read_only_fields = [
            'id', 'created_by_username', 'last_health_check', 'health_status',
            'error_message', 'request_count', 'last_used_at', 'created_at',
            'updated_at', 'is_healthy'
        ]

    def validate_configuration(self, value):
        """Ensure the configuration payload is a JSON object (dict)."""
        if not isinstance(value, dict):
            raise serializers.ValidationError("Configuration must be a dictionary")
        return value

    def validate_authentication_config(self, value):
        """Ensure the authentication payload is a JSON object (dict)."""
        if not isinstance(value, dict):
            raise serializers.ValidationError("Authentication configuration must be a dictionary")
        return value
class ChatOpsIntegrationSerializer(serializers.ModelSerializer):
    """Serializer for ChatOpsIntegration.

    NOTE(review): ``bot_token`` is included in ``fields`` and therefore
    readable through the API — confirm this is intentional or mark it
    write-only upstream.
    """

    created_by_username = serializers.CharField(source='created_by.username', read_only=True)

    class Meta:
        model = ChatOpsIntegration
        fields = [
            'id', 'name', 'platform', 'webhook_url', 'bot_token', 'channel_id',
            'command_prefix', 'available_commands', 'allowed_users', 'allowed_channels',
            'is_active', 'last_activity', 'created_by', 'created_by_username',
            'created_at', 'updated_at'
        ]
        read_only_fields = [
            'id', 'created_by_username', 'last_activity', 'created_at', 'updated_at'
        ]

    def validate_available_commands(self, value):
        """Validate commands: a list of dicts, each with name + description."""
        if not isinstance(value, list):
            raise serializers.ValidationError("Available commands must be a list")
        for i, command in enumerate(value):
            if not isinstance(command, dict):
                raise serializers.ValidationError(f"Command {i+1} must be a dictionary")
            required_fields = ['name', 'description']
            for field in required_fields:
                if field not in command:
                    raise serializers.ValidationError(f"Command {i+1} missing required field: {field}")
        return value
class ChatOpsCommandSerializer(serializers.ModelSerializer):
    """Serializer for ChatOpsCommand; flattens related names/titles read-only."""

    chatops_integration_name = serializers.CharField(source='chatops_integration.name', read_only=True)
    triggered_runbook_name = serializers.CharField(source='triggered_runbook.name', read_only=True)
    related_incident_title = serializers.CharField(source='related_incident.title', read_only=True)

    class Meta:
        model = ChatOpsCommand
        fields = [
            'id', 'chatops_integration', 'chatops_integration_name', 'command', 'arguments',
            'user_id', 'channel_id', 'triggered_runbook', 'triggered_runbook_name',
            'related_incident', 'related_incident_title', 'status', 'response_message',
            'execution_log', 'error_message', 'executed_at', 'completed_at'
        ]
        read_only_fields = [
            'id', 'chatops_integration_name', 'triggered_runbook_name',
            'related_incident_title', 'executed_at', 'completed_at'
        ]
class AutoRemediationSerializer(serializers.ModelSerializer):
    """Serializer for AutoRemediation.

    Exposes the model's computed ``success_rate`` (0.0-1.0) and the
    usernames of the users allowed to approve this remediation.
    """

    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    approval_users_usernames = serializers.SerializerMethodField()
    # Backed by the model @property.
    success_rate = serializers.FloatField(read_only=True)

    class Meta:
        model = AutoRemediation
        fields = [
            'id', 'name', 'description', 'remediation_type', 'trigger_conditions',
            'trigger_condition_type', 'remediation_config', 'timeout_seconds',
            'requires_approval', 'approval_users', 'approval_users_usernames',
            'max_executions_per_incident', 'is_active', 'created_by', 'created_by_username',
            'created_at', 'updated_at', 'execution_count', 'success_count',
            'last_executed_at', 'success_rate'
        ]
        read_only_fields = [
            'id', 'created_by_username', 'approval_users_usernames', 'created_at',
            'updated_at', 'execution_count', 'success_count', 'last_executed_at', 'success_rate'
        ]

    def get_approval_users_usernames(self, obj):
        """Get usernames of approval users (extra query per object)."""
        return [user.username for user in obj.approval_users.all()]
class AutoRemediationExecutionSerializer(serializers.ModelSerializer):
    """Serializer for AutoRemediationExecution; flattens related names read-only."""

    auto_remediation_name = serializers.CharField(source='auto_remediation.name', read_only=True)
    incident_title = serializers.CharField(source='incident.title', read_only=True)
    approved_by_username = serializers.CharField(source='approved_by.username', read_only=True)

    class Meta:
        model = AutoRemediationExecution
        fields = [
            'id', 'auto_remediation', 'auto_remediation_name', 'incident', 'incident_title',
            'status', 'trigger_data', 'approved_by', 'approved_by_username', 'approved_at',
            'approval_notes', 'execution_log', 'output_data', 'error_message',
            'triggered_at', 'started_at', 'completed_at', 'duration'
        ]
        read_only_fields = [
            'id', 'auto_remediation_name', 'incident_title', 'approved_by_username',
            'triggered_at', 'started_at', 'completed_at', 'duration'
        ]
class MaintenanceWindowSerializer(serializers.ModelSerializer):
    """Serializer for MaintenanceWindow.

    Exposes the model's ``is_active()``/``is_scheduled()`` methods as
    read-only booleans and duplicates the start<end check at the API layer.
    """

    created_by_username = serializers.CharField(source='created_by.username', read_only=True)
    # Backed by same-named model methods (DRF invokes callables).
    is_active = serializers.BooleanField(read_only=True)
    is_scheduled = serializers.BooleanField(read_only=True)

    class Meta:
        model = MaintenanceWindow
        fields = [
            'id', 'name', 'description', 'start_time', 'end_time', 'timezone',
            'affected_services', 'affected_components', 'suppress_incident_creation',
            'suppress_notifications', 'suppress_escalations', 'status', 'created_by',
            'created_by_username', 'created_at', 'updated_at', 'incidents_suppressed',
            'notifications_suppressed', 'is_active', 'is_scheduled'
        ]
        read_only_fields = [
            'id', 'created_by_username', 'created_at', 'updated_at',
            'incidents_suppressed', 'notifications_suppressed', 'is_active', 'is_scheduled'
        ]

    def validate(self, data):
        """Reject windows whose start time is not strictly before the end time."""
        # Only enforced when both bounds are present (e.g. partial updates).
        if data.get('start_time') and data.get('end_time'):
            if data['start_time'] >= data['end_time']:
                raise serializers.ValidationError("Start time must be before end time")
        return data
class WorkflowTemplateSerializer(serializers.ModelSerializer):
    """Serializer for WorkflowTemplate with structural validation of steps."""

    created_by_username = serializers.CharField(source='created_by.username', read_only=True)

    class Meta:
        model = WorkflowTemplate
        fields = [
            'id', 'name', 'description', 'template_type', 'workflow_steps',
            'input_parameters', 'output_schema', 'usage_count', 'is_public',
            'created_by', 'created_by_username', 'created_at', 'updated_at'
        ]
        read_only_fields = [
            'id', 'created_by_username', 'usage_count', 'created_at', 'updated_at'
        ]

    def validate_workflow_steps(self, value):
        """Validate steps: a list of dicts, each with name/action/conditions."""
        if not isinstance(value, list):
            raise serializers.ValidationError("Workflow steps must be a list")
        for i, step in enumerate(value):
            if not isinstance(step, dict):
                raise serializers.ValidationError(f"Step {i+1} must be a dictionary")
            required_fields = ['name', 'action', 'conditions']
            for field in required_fields:
                if field not in step:
                    raise serializers.ValidationError(f"Step {i+1} missing required field: {field}")
        return value
class WorkflowExecutionSerializer(serializers.ModelSerializer):
    """Serializer for WorkflowExecution; flattens names of related objects."""

    workflow_template_name = serializers.CharField(source='workflow_template.name', read_only=True)
    triggered_by_username = serializers.CharField(source='triggered_by.username', read_only=True)
    related_incident_title = serializers.CharField(source='related_incident.title', read_only=True)
    related_maintenance_name = serializers.CharField(source='related_maintenance.name', read_only=True)

    class Meta:
        model = WorkflowExecution
        fields = [
            'id', 'workflow_template', 'workflow_template_name', 'name', 'triggered_by',
            'triggered_by_username', 'trigger_type', 'related_incident', 'related_incident_title',
            'related_maintenance', 'related_maintenance_name', 'status', 'current_step',
            'total_steps', 'input_data', 'output_data', 'execution_log', 'error_message',
            'started_at', 'completed_at', 'duration'
        ]
        read_only_fields = [
            'id', 'workflow_template_name', 'triggered_by_username', 'related_incident_title',
            'related_maintenance_name', 'started_at', 'completed_at', 'duration'
        ]

View File

@@ -0,0 +1,63 @@
"""
Signal handlers for automation_orchestration app
"""
from django.db.models.signals import post_save, pre_save
from django.dispatch import receiver
from django.utils import timezone
from .models import (
RunbookExecution,
AutoRemediationExecution,
MaintenanceWindow,
)
@receiver(post_save, sender=RunbookExecution)
def update_runbook_statistics(sender, instance, created, **kwargs):
    """Keep the parent Runbook's aggregate statistics in sync.

    Recomputes ``execution_count`` and ``success_rate`` from the database
    whenever an execution reaches a terminal state.  Deriving both values
    from a query (instead of incrementing in-memory counters) fixes two
    defects in the previous version:

    * failed/cancelled/timed-out runs never refreshed ``success_rate``
      (it was only recalculated on COMPLETED), so the stored rate stayed
      artificially high after failures;
    * re-saving an already-finished execution (e.g. to append log data)
      double-incremented ``execution_count``.  The recompute is idempotent.
    """
    terminal_statuses = ('COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT')
    if instance.status not in terminal_statuses:
        return
    runbook = instance.runbook
    finished = RunbookExecution.objects.filter(
        runbook=runbook,
        status__in=terminal_statuses,
    )
    total = finished.count()
    successes = finished.filter(status='COMPLETED').count()
    runbook.execution_count = total
    runbook.success_rate = successes / total if total > 0 else 0.0
    runbook.last_executed_at = instance.started_at
    runbook.save(update_fields=['execution_count', 'success_rate', 'last_executed_at'])
@receiver(post_save, sender=AutoRemediationExecution)
def update_auto_remediation_statistics(sender, instance, created, **kwargs):
    """Recompute AutoRemediation counters when an execution finishes.

    Counters are derived from the database rather than incremented, making
    the handler idempotent: saving a finished execution a second time
    (e.g. to attach output data) no longer inflates ``execution_count`` /
    ``success_count`` as the previous increment-based version did.
    """
    terminal_statuses = ('COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT')
    if instance.status not in terminal_statuses:
        return
    remediation = instance.auto_remediation
    finished = AutoRemediationExecution.objects.filter(
        auto_remediation=remediation,
        status__in=terminal_statuses,
    )
    remediation.execution_count = finished.count()
    remediation.success_count = finished.filter(status='COMPLETED').count()
    remediation.last_executed_at = instance.triggered_at
    remediation.save(update_fields=['execution_count', 'success_count', 'last_executed_at'])
@receiver(pre_save, sender=MaintenanceWindow)
def validate_maintenance_window(sender, instance, **kwargs):
    """Sanity-check a MaintenanceWindow and roll its status forward on save.

    Rejects windows whose start is not strictly before their end, then
    auto-promotes SCHEDULED -> ACTIVE while the window is in progress and
    SCHEDULED/ACTIVE -> COMPLETED once it has passed.
    """
    start, end = instance.start_time, instance.end_time
    if not (start and end):
        # Nothing to validate or advance without both bounds.
        return
    if start >= end:
        raise ValueError("Start time must be before end time")
    now = timezone.now()
    if start <= now <= end and instance.status == 'SCHEDULED':
        instance.status = 'ACTIVE'
    elif end < now and instance.status in ('SCHEDULED', 'ACTIVE'):
        instance.status = 'COMPLETED'

View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View File

@@ -0,0 +1,36 @@
"""
URL configuration for automation_orchestration app
"""
from django.urls import path, include
from rest_framework.routers import DefaultRouter
from .views.automation import (
RunbookViewSet,
RunbookExecutionViewSet,
IntegrationViewSet,
ChatOpsIntegrationViewSet,
ChatOpsCommandViewSet,
AutoRemediationViewSet,
AutoRemediationExecutionViewSet,
MaintenanceWindowViewSet,
WorkflowTemplateViewSet,
WorkflowExecutionViewSet,
)
# Create router and register viewsets.
# DefaultRouter generates list/detail/extra-action routes for each viewset
# and an API root view; basenames are derived from each viewset's queryset.
router = DefaultRouter()
router.register(r'runbooks', RunbookViewSet)
router.register(r'runbook-executions', RunbookExecutionViewSet)
router.register(r'integrations', IntegrationViewSet)
router.register(r'chatops-integrations', ChatOpsIntegrationViewSet)
router.register(r'chatops-commands', ChatOpsCommandViewSet)
router.register(r'auto-remediations', AutoRemediationViewSet)
router.register(r'auto-remediation-executions', AutoRemediationExecutionViewSet)
router.register(r'maintenance-windows', MaintenanceWindowViewSet)
router.register(r'workflow-templates', WorkflowTemplateViewSet)
router.register(r'workflow-executions', WorkflowExecutionViewSet)

# Namespace for reverse() lookups, e.g. 'automation_orchestration:runbook-list'.
app_name = 'automation_orchestration'

urlpatterns = [
    path('', include(router.urls)),
]

View File

@@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View File

@@ -0,0 +1,29 @@
"""
Views for Automation & Orchestration module
"""
from .automation import (
RunbookViewSet,
RunbookExecutionViewSet,
IntegrationViewSet,
ChatOpsIntegrationViewSet,
ChatOpsCommandViewSet,
AutoRemediationViewSet,
AutoRemediationExecutionViewSet,
MaintenanceWindowViewSet,
WorkflowTemplateViewSet,
WorkflowExecutionViewSet,
)
__all__ = [
'RunbookViewSet',
'RunbookExecutionViewSet',
'IntegrationViewSet',
'ChatOpsIntegrationViewSet',
'ChatOpsCommandViewSet',
'AutoRemediationViewSet',
'AutoRemediationExecutionViewSet',
'MaintenanceWindowViewSet',
'WorkflowTemplateViewSet',
'WorkflowExecutionViewSet',
]

View File

@@ -0,0 +1,411 @@
"""
Views for Automation & Orchestration models
"""
from rest_framework import viewsets, status, permissions
from rest_framework.decorators import action
from rest_framework.response import Response
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework.filters import SearchFilter, OrderingFilter
from django.utils import timezone
from django.db.models import Q
from ..models import (
Runbook,
RunbookExecution,
Integration,
ChatOpsIntegration,
ChatOpsCommand,
AutoRemediation,
AutoRemediationExecution,
MaintenanceWindow,
WorkflowTemplate,
WorkflowExecution,
)
from ..serializers.automation import (
RunbookSerializer,
RunbookExecutionSerializer,
IntegrationSerializer,
ChatOpsIntegrationSerializer,
ChatOpsCommandSerializer,
AutoRemediationSerializer,
AutoRemediationExecutionSerializer,
MaintenanceWindowSerializer,
WorkflowTemplateSerializer,
WorkflowExecutionSerializer,
)
class RunbookViewSet(viewsets.ModelViewSet):
    """CRUD API for runbooks, plus manual-execution endpoints."""
    queryset = Runbook.objects.all()
    serializer_class = RunbookSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['status', 'trigger_type', 'category', 'is_public']
    search_fields = ['name', 'description', 'category']
    ordering_fields = ['name', 'created_at', 'updated_at', 'execution_count', 'success_rate']
    ordering = ['-created_at']

    def get_queryset(self):
        """Staff see every runbook; other users see public ones and their own."""
        qs = super().get_queryset()
        user = self.request.user
        if user.is_staff:
            return qs
        return qs.filter(Q(is_public=True) | Q(created_by=user))

    def perform_create(self, serializer):
        """Record the requesting user as the runbook's creator."""
        serializer.save(created_by=self.request.user)

    def perform_update(self, serializer):
        """Record the requesting user as the runbook's last modifier."""
        serializer.save(last_modified_by=self.request.user)

    @action(detail=True, methods=['post'])
    def execute(self, request, pk=None):
        """Manually trigger a runbook, creating a new execution record."""
        runbook = self.get_object()
        if not runbook.can_be_triggered_by(request.user):
            return Response(
                {'error': 'You do not have permission to execute this runbook'},
                status=status.HTTP_403_FORBIDDEN
            )
        # Record the execution; the step count is snapshotted up front.
        execution = RunbookExecution.objects.create(
            runbook=runbook,
            triggered_by=request.user,
            trigger_type='MANUAL',
            trigger_data=request.data.get('trigger_data', {}),
            total_steps=len(runbook.steps)
        )
        # TODO: Start actual execution in background task
        payload = RunbookExecutionSerializer(execution, context={'request': request}).data
        return Response(payload, status=status.HTTP_201_CREATED)

    @action(detail=False, methods=['get'])
    def available_for_trigger(self, request):
        """List ACTIVE runbooks the current user is allowed to trigger."""
        candidates = self.get_queryset().filter(status='ACTIVE')
        triggerable = [runbook for runbook in candidates
                       if runbook.can_be_triggered_by(request.user)]
        return Response(self.get_serializer(triggerable, many=True).data)
class RunbookExecutionViewSet(viewsets.ReadOnlyModelViewSet):
    """Read-only API over runbook execution history."""
    queryset = RunbookExecution.objects.all()
    serializer_class = RunbookExecutionSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['status', 'trigger_type', 'runbook', 'incident']
    search_fields = ['runbook__name']
    ordering_fields = ['started_at', 'completed_at', 'duration']
    ordering = ['-started_at']

    def get_queryset(self):
        """Non-staff users only see executions they triggered, or ones
        attached to incidents they reported or are assigned to."""
        qs = super().get_queryset()
        user = self.request.user
        if user.is_staff:
            return qs
        visible = (
            Q(triggered_by=user)
            | Q(incident__assigned_to=user)
            | Q(incident__reporter=user)
        )
        return qs.filter(visible)
class IntegrationViewSet(viewsets.ModelViewSet):
    """ViewSet for Integration model.

    Provides CRUD for external integrations plus ``test_connection`` and
    ``health_check`` actions. Both actions are currently stubbed (see the
    TODOs) until real connectors are implemented.
    """
    queryset = Integration.objects.all()
    serializer_class = IntegrationSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['integration_type', 'status', 'health_status']
    search_fields = ['name', 'description']
    ordering_fields = ['name', 'created_at', 'last_used_at']
    ordering = ['name']

    def perform_create(self, serializer):
        """Set the creator when creating an integration."""
        serializer.save(created_by=self.request.user)

    @action(detail=True, methods=['post'])
    def test_connection(self, request, pk=None):
        """Test integration connection.

        TODO: Implement actual connection testing; for now this returns a
        mock success response without touching the record.
        """
        integration = self.get_object()
        return Response({
            'status': 'success',
            'message': f'Connection test for {integration.name} completed',
            'health_status': 'HEALTHY'
        })

    @action(detail=True, methods=['post'])
    def health_check(self, request, pk=None):
        """Perform health check on integration.

        TODO: Implement an actual health probe; for now the record is simply
        marked healthy with a fresh timestamp.
        """
        integration = self.get_object()
        integration.last_health_check = timezone.now()
        integration.health_status = 'HEALTHY'
        # Persist only the fields touched here so a concurrent update to any
        # other column is not silently overwritten by a full-row save.
        integration.save(update_fields=['health_status', 'last_health_check'])
        return Response({
            'status': 'success',
            'health_status': integration.health_status,
            'last_health_check': integration.last_health_check
        })
class ChatOpsIntegrationViewSet(viewsets.ModelViewSet):
    """CRUD API for chat-platform (ChatOps) integrations."""
    queryset = ChatOpsIntegration.objects.all()
    serializer_class = ChatOpsIntegrationSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['platform', 'is_active']
    search_fields = ['name']
    ordering_fields = ['name', 'created_at', 'last_activity']
    ordering = ['name']

    def perform_create(self, serializer):
        """Stamp the requesting user as the integration's creator."""
        serializer.save(created_by=self.request.user)

    @action(detail=True, methods=['post'])
    def test_webhook(self, request, pk=None):
        """Fire a test against the integration's webhook.

        TODO: Implement actual webhook testing; currently returns a stub.
        """
        integration = self.get_object()
        return Response({
            'status': 'success',
            'message': f'Webhook test for {integration.name} completed'
        })
class ChatOpsCommandViewSet(viewsets.ReadOnlyModelViewSet):
    """ViewSet for ChatOpsCommand model (read-only).

    Exposes list/retrieve only (no create/update via this API), so command
    records can be browsed for audit and troubleshooting.
    """
    queryset = ChatOpsCommand.objects.all()
    serializer_class = ChatOpsCommandSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    # Filter by processing status, owning integration, or the command itself.
    filterset_fields = ['status', 'chatops_integration', 'command']
    search_fields = ['command', 'user_id']
    ordering_fields = ['executed_at', 'completed_at']
    ordering = ['-executed_at']  # newest commands first by default
class AutoRemediationViewSet(viewsets.ModelViewSet):
    """CRUD API for auto-remediation definitions."""
    queryset = AutoRemediation.objects.all()
    serializer_class = AutoRemediationSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['remediation_type', 'trigger_condition_type', 'is_active', 'requires_approval']
    search_fields = ['name', 'description']
    ordering_fields = ['name', 'created_at', 'execution_count', 'success_count']
    ordering = ['name']

    def perform_create(self, serializer):
        """Stamp the requesting user as the remediation's creator."""
        serializer.save(created_by=self.request.user)

    @action(detail=True, methods=['post'])
    def test_trigger(self, request, pk=None):
        """Dry-run the remediation's trigger conditions.

        TODO: Implement actual trigger testing; currently echoes the
        configured conditions back in a stub response.
        """
        remediation = self.get_object()
        return Response({
            'status': 'success',
            'message': f'Trigger test for {remediation.name} completed',
            'trigger_conditions': remediation.trigger_conditions
        })
class AutoRemediationExecutionViewSet(viewsets.ReadOnlyModelViewSet):
    """ViewSet for AutoRemediationExecution model (read-only).

    Execution records are produced by the automation engine; this API exposes
    them for review plus explicit ``approve``/``reject`` actions on pending
    executions.
    """
    queryset = AutoRemediationExecution.objects.all()
    serializer_class = AutoRemediationExecutionSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['status', 'auto_remediation', 'incident']
    search_fields = ['auto_remediation__name', 'incident__title']
    ordering_fields = ['triggered_at', 'started_at', 'completed_at']
    ordering = ['-triggered_at']

    @action(detail=True, methods=['post'])
    def approve(self, request, pk=None):
        """Approve a pending auto-remediation execution.

        Only users listed in the remediation's ``approval_users`` may approve,
        and only while the execution is still PENDING.
        """
        execution = self.get_object()
        if execution.status != 'PENDING':
            return Response(
                {'error': 'Only pending remediations can be approved'},
                status=status.HTTP_400_BAD_REQUEST
            )
        if not execution.auto_remediation.requires_approval:
            return Response(
                {'error': 'This remediation does not require approval'},
                status=status.HTTP_400_BAD_REQUEST
            )
        if request.user not in execution.auto_remediation.approval_users.all():
            return Response(
                {'error': 'You do not have permission to approve this remediation'},
                status=status.HTTP_403_FORBIDDEN
            )
        execution.status = 'APPROVED'
        execution.approved_by = request.user
        execution.approved_at = timezone.now()
        execution.approval_notes = request.data.get('approval_notes', '')
        execution.save()
        # TODO: Start actual remediation execution
        serializer = self.get_serializer(execution)
        return Response(serializer.data)

    @action(detail=True, methods=['post'])
    def reject(self, request, pk=None):
        """Reject a pending auto-remediation execution.

        Mirrors ``approve``: when the remediation requires approval, only
        designated approvers may reject it. (Previously any authenticated
        user could reject, bypassing the approval gate.)
        """
        execution = self.get_object()
        if execution.status != 'PENDING':
            return Response(
                {'error': 'Only pending remediations can be rejected'},
                status=status.HTTP_400_BAD_REQUEST
            )
        if (execution.auto_remediation.requires_approval
                and request.user not in execution.auto_remediation.approval_users.all()):
            return Response(
                {'error': 'You do not have permission to reject this remediation'},
                status=status.HTTP_403_FORBIDDEN
            )
        execution.status = 'REJECTED'
        # The approver fields double as the rejection audit trail.
        execution.approved_by = request.user
        execution.approved_at = timezone.now()
        execution.approval_notes = request.data.get('rejection_notes', '')
        execution.save()
        serializer = self.get_serializer(execution)
        return Response(serializer.data)
class MaintenanceWindowViewSet(viewsets.ModelViewSet):
    """CRUD API for maintenance windows, with convenience listings."""
    queryset = MaintenanceWindow.objects.all()
    serializer_class = MaintenanceWindowSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['status']
    search_fields = ['name', 'description']
    ordering_fields = ['name', 'start_time', 'end_time', 'created_at']
    ordering = ['start_time']

    def perform_create(self, serializer):
        """Stamp the requesting user as the window's creator."""
        serializer.save(created_by=self.request.user)

    @action(detail=False, methods=['get'])
    def active(self, request):
        """Windows whose ACTIVE status and time range cover right now."""
        now = timezone.now()
        windows = self.get_queryset().filter(
            status='ACTIVE', start_time__lte=now, end_time__gte=now
        )
        return Response(self.get_serializer(windows, many=True).data)

    @action(detail=False, methods=['get'])
    def upcoming(self, request):
        """SCHEDULED windows that have not started yet."""
        now = timezone.now()
        windows = self.get_queryset().filter(status='SCHEDULED', start_time__gt=now)
        return Response(self.get_serializer(windows, many=True).data)
class WorkflowTemplateViewSet(viewsets.ModelViewSet):
    """CRUD API for reusable workflow templates."""
    queryset = WorkflowTemplate.objects.all()
    serializer_class = WorkflowTemplateSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['template_type', 'is_public']
    search_fields = ['name', 'description']
    ordering_fields = ['name', 'created_at', 'usage_count']
    ordering = ['name']

    def get_queryset(self):
        """Staff see every template; others see public ones and their own."""
        qs = super().get_queryset()
        user = self.request.user
        if user.is_staff:
            return qs
        return qs.filter(Q(is_public=True) | Q(created_by=user))

    def perform_create(self, serializer):
        """Stamp the requesting user as the template's creator."""
        serializer.save(created_by=self.request.user)
class WorkflowExecutionViewSet(viewsets.ReadOnlyModelViewSet):
    """Read-only API over workflow execution history."""
    queryset = WorkflowExecution.objects.all()
    serializer_class = WorkflowExecutionSerializer
    permission_classes = [permissions.IsAuthenticated]
    filter_backends = [DjangoFilterBackend, SearchFilter, OrderingFilter]
    filterset_fields = ['status', 'workflow_template', 'trigger_type']
    search_fields = ['name', 'workflow_template__name']
    ordering_fields = ['started_at', 'completed_at', 'duration']
    ordering = ['-started_at']

    def get_queryset(self):
        """Non-staff users only see executions they triggered, or ones tied
        to incidents they reported or are assigned to."""
        qs = super().get_queryset()
        user = self.request.user
        if user.is_staff:
            return qs
        visible = (
            Q(triggered_by=user)
            | Q(related_incident__assigned_to=user)
            | Q(related_incident__reporter=user)
        )
        return qs.filter(visible)

View File

@@ -0,0 +1,586 @@
# Collaboration & War Rooms API Documentation
## Overview
The Collaboration & War Rooms module provides real-time incident collaboration capabilities including war rooms, conference bridges, incident command roles, and timeline reconstruction for postmortems.
## Features
- **Real-time Incident Rooms**: Auto-created Slack/Teams channels per incident
- **Conference Bridge Integration**: Zoom, Teams, Webex integration
- **Incident Command Roles**: Assign Incident Commander, Scribe, Comms Lead
- **Timeline Reconstruction**: Automatically ordered events + human notes for postmortems
## API Endpoints
### War Rooms
#### List War Rooms
```
GET /api/collaboration-war-rooms/war-rooms/
```
**Query Parameters:**
- `status`: Filter by status (ACTIVE, ARCHIVED, CLOSED)
- `privacy_level`: Filter by privacy level (PUBLIC, PRIVATE, RESTRICTED)
- `incident__severity`: Filter by incident severity
- `search`: Search in name, description, incident title
- `ordering`: Order by created_at, last_activity, message_count
**Response:**
```json
{
"count": 10,
"next": null,
"previous": null,
"results": [
{
"id": "uuid",
"name": "Incident 123 - Database Outage",
"incident_title": "Database Outage",
"incident_severity": "CRITICAL",
"status": "ACTIVE",
"privacy_level": "PRIVATE",
"message_count": 45,
"last_activity": "2024-01-15T10:30:00Z",
"participant_count": 5,
"created_at": "2024-01-15T09:00:00Z"
}
]
}
```
#### Create War Room
```
POST /api/collaboration-war-rooms/war-rooms/
```
**Request Body:**
```json
{
"name": "Incident 123 - Database Outage",
"description": "War room for database outage incident",
"incident_id": "uuid",
"privacy_level": "PRIVATE",
"allowed_user_ids": ["uuid1", "uuid2"]
}
```
#### Get War Room Details
```
GET /api/collaboration-war-rooms/war-rooms/{id}/
```
#### Update War Room
```
PUT /api/collaboration-war-rooms/war-rooms/{id}/
PATCH /api/collaboration-war-rooms/war-rooms/{id}/
```
#### Add Participant
```
POST /api/collaboration-war-rooms/war-rooms/{id}/add_participant/
```
**Request Body:**
```json
{
"user_id": "uuid"
}
```
#### Remove Participant
```
POST /api/collaboration-war-rooms/war-rooms/{id}/remove_participant/
```
**Request Body:**
```json
{
"user_id": "uuid"
}
```
#### Get War Room Messages
```
GET /api/collaboration-war-rooms/war-rooms/{id}/messages/
```
### Conference Bridges
#### List Conference Bridges
```
GET /api/collaboration-war-rooms/conference-bridges/
```
**Query Parameters:**
- `bridge_type`: Filter by bridge type (ZOOM, TEAMS, WEBEX, etc.)
- `status`: Filter by status (SCHEDULED, ACTIVE, ENDED, CANCELLED)
- `incident__severity`: Filter by incident severity
- `search`: Search in name, description, incident title
- `ordering`: Order by scheduled_start, created_at
#### Create Conference Bridge
```
POST /api/collaboration-war-rooms/conference-bridges/
```
**Request Body:**
```json
{
"name": "Incident 123 - Database Outage Call",
"description": "Emergency conference call for database outage",
"incident_id": "uuid",
"war_room_id": "uuid",
"bridge_type": "ZOOM",
"scheduled_start": "2024-01-15T10:00:00Z",
"scheduled_end": "2024-01-15T11:00:00Z",
"invited_participant_ids": ["uuid1", "uuid2"],
"recording_enabled": true,
"transcription_enabled": true
}
```
#### Join Conference
```
POST /api/collaboration-war-rooms/conference-bridges/{id}/join_conference/
```
#### Start Conference
```
POST /api/collaboration-war-rooms/conference-bridges/{id}/start_conference/
```
#### End Conference
```
POST /api/collaboration-war-rooms/conference-bridges/{id}/end_conference/
```
### Incident Command Roles
#### List Command Roles
```
GET /api/collaboration-war-rooms/command-roles/
```
**Query Parameters:**
- `role_type`: Filter by role type (INCIDENT_COMMANDER, SCRIBE, COMMS_LEAD, etc.)
- `status`: Filter by status (ACTIVE, INACTIVE, REASSIGNED)
- `incident__severity`: Filter by incident severity
- `search`: Search in incident title, assigned user username
- `ordering`: Order by assigned_at, created_at
#### Create Command Role
```
POST /api/collaboration-war-rooms/command-roles/
```
**Request Body:**
```json
{
"incident_id": "uuid",
"war_room_id": "uuid",
"role_type": "INCIDENT_COMMANDER",
"assigned_user_id": "uuid",
"responsibilities": [
"Overall incident coordination",
"Decision making authority",
"Communication with stakeholders"
],
"decision_authority": [
"TECHNICAL",
"BUSINESS",
"ESCALATION"
]
}
```
#### Reassign Role
```
POST /api/collaboration-war-rooms/command-roles/{id}/reassign_role/
```
**Request Body:**
```json
{
"new_user_id": "uuid",
"notes": "Reassigning due to shift change"
}
```
### Timeline Events
#### List Timeline Events
```
GET /api/collaboration-war-rooms/timeline-events/
```
**Query Parameters:**
- `event_type`: Filter by event type (INCIDENT_CREATED, STATUS_CHANGED, etc.)
- `source_type`: Filter by source type (SYSTEM, USER, INTEGRATION, AUTOMATION)
- `is_critical_event`: Filter critical events for postmortems
- `incident__severity`: Filter by incident severity
- `search`: Search in title, description, incident title
- `ordering`: Order by event_time, created_at
#### Get Critical Events
```
GET /api/collaboration-war-rooms/timeline-events/critical_events/
```
**Response:**
```json
{
"count": 5,
"results": [
{
"id": "uuid",
"incident_title": "Database Outage",
"event_type": "SLA_BREACHED",
"title": "SLA Breached: Response Time",
"description": "SLA 'Response Time' has been breached",
"source_type": "SYSTEM",
"event_time": "2024-01-15T10:15:00Z",
"related_user_name": null,
"is_critical_event": true,
"created_at": "2024-01-15T10:15:00Z"
}
]
}
```
### War Room Messages
#### List Messages
```
GET /api/collaboration-war-rooms/war-room-messages/
```
**Query Parameters:**
- `message_type`: Filter by message type (TEXT, SYSTEM, COMMAND, ALERT, UPDATE)
- `war_room`: Filter by war room ID
- `sender`: Filter by sender ID
- `search`: Search in content, sender name
- `ordering`: Order by created_at
#### Create Message
```
POST /api/collaboration-war-rooms/war-room-messages/
```
**Request Body:**
```json
{
"war_room_id": "uuid",
"message_type": "TEXT",
"content": "Database connection restored. Monitoring for stability.",
"sender_id": "uuid",
"sender_name": "John Doe"
}
```
### Incident Decisions
#### List Decisions
```
GET /api/collaboration-war-rooms/incident-decisions/
```
**Query Parameters:**
- `decision_type`: Filter by decision type (TECHNICAL, BUSINESS, COMMUNICATION, etc.)
- `status`: Filter by status (PENDING, APPROVED, REJECTED, IMPLEMENTED)
- `incident__severity`: Filter by incident severity
- `search`: Search in title, description, incident title
- `ordering`: Order by created_at, approved_at, implemented_at
#### Create Decision
```
POST /api/collaboration-war-rooms/incident-decisions/
```
**Request Body:**
```json
{
"incident_id": "uuid",
"command_role_id": "uuid",
"decision_type": "TECHNICAL",
"title": "Restart Database Cluster",
"description": "Decision to restart the primary database cluster to resolve connection issues",
"rationale": "Multiple connection timeouts indicate cluster instability. Restart should resolve the issue.",
"requires_approval": true
}
```
#### Approve Decision
```
POST /api/collaboration-war-rooms/incident-decisions/{id}/approve_decision/
```
#### Implement Decision
```
POST /api/collaboration-war-rooms/incident-decisions/{id}/implement_decision/
```
**Request Body:**
```json
{
"notes": "Database cluster restarted successfully. All connections restored."
}
```
## Data Models
### WarRoom
- `id`: UUID primary key
- `name`: War room name
- `description`: War room description
- `incident`: Related incident (ForeignKey)
- `status`: ACTIVE, ARCHIVED, CLOSED
- `privacy_level`: PUBLIC, PRIVATE, RESTRICTED
- `slack_channel_id`: Slack channel ID
- `teams_channel_id`: Teams channel ID
- `discord_channel_id`: Discord channel ID
- `allowed_users`: Users with access (ManyToMany)
- `required_clearance_level`: Required security clearance
- `message_count`: Number of messages
- `last_activity`: Last activity timestamp
- `active_participants`: Number of active participants
- `created_by`: Creator (ForeignKey to User)
- `created_at`: Creation timestamp
- `updated_at`: Last update timestamp
- `archived_at`: Archive timestamp
### ConferenceBridge
- `id`: UUID primary key
- `name`: Conference name
- `description`: Conference description
- `incident`: Related incident (ForeignKey)
- `war_room`: Related war room (ForeignKey)
- `bridge_type`: ZOOM, TEAMS, WEBEX, GOTO_MEETING, CUSTOM
- `status`: SCHEDULED, ACTIVE, ENDED, CANCELLED
- `meeting_id`: External meeting ID
- `meeting_url`: Meeting URL
- `dial_in_number`: Dial-in phone number
- `access_code`: Access code for dial-in
- `scheduled_start`: Scheduled start time
- `scheduled_end`: Scheduled end time
- `actual_start`: Actual start time
- `actual_end`: Actual end time
- `invited_participants`: Invited users (ManyToMany)
- `active_participants`: Active users (ManyToMany)
- `max_participants`: Maximum participants
- `recording_enabled`: Recording enabled flag
- `recording_url`: Recording URL
- `transcription_enabled`: Transcription enabled flag
- `transcription_url`: Transcription URL
- `integration_config`: Integration configuration (JSON)
- `created_by`: Creator (ForeignKey to User)
- `created_at`: Creation timestamp
- `updated_at`: Last update timestamp
### IncidentCommandRole
- `id`: UUID primary key
- `incident`: Related incident (ForeignKey)
- `war_room`: Related war room (ForeignKey)
- `role_type`: INCIDENT_COMMANDER, SCRIBE, COMMS_LEAD, TECHNICAL_LEAD, BUSINESS_LEAD, EXTERNAL_LIAISON, OBSERVER
- `assigned_user`: Assigned user (ForeignKey to User)
- `status`: ACTIVE, INACTIVE, REASSIGNED
- `responsibilities`: List of responsibilities (JSON)
- `decision_authority`: Areas of decision authority (JSON)
- `assigned_at`: Assignment timestamp
- `reassigned_at`: Reassignment timestamp
- `reassigned_by`: User who reassigned (ForeignKey to User)
- `assignment_notes`: Assignment notes
- `decisions_made`: Number of decisions made
- `communications_sent`: Number of communications sent
- `last_activity`: Last activity timestamp
- `created_by`: Creator (ForeignKey to User)
- `created_at`: Creation timestamp
- `updated_at`: Last update timestamp
### TimelineEvent
- `id`: UUID primary key
- `incident`: Related incident (ForeignKey)
- `event_type`: Event type (INCIDENT_CREATED, STATUS_CHANGED, etc.)
- `title`: Event title
- `description`: Event description
- `source_type`: SYSTEM, USER, INTEGRATION, AUTOMATION
- `event_time`: When the event occurred
- `created_at`: Creation timestamp
- `related_user`: Related user (ForeignKey to User)
- `related_runbook_execution`: Related runbook execution (ForeignKey)
- `related_auto_remediation`: Related auto-remediation (ForeignKey)
- `related_sla_instance`: Related SLA instance (ForeignKey)
- `related_escalation`: Related escalation (ForeignKey)
- `related_war_room`: Related war room (ForeignKey)
- `related_conference`: Related conference (ForeignKey)
- `related_command_role`: Related command role (ForeignKey)
- `event_data`: Additional event data (JSON)
- `tags`: Event tags (JSON)
- `is_critical_event`: Critical for postmortem flag
- `postmortem_notes`: Postmortem notes
- `created_by`: Creator (ForeignKey to User)
### WarRoomMessage
- `id`: UUID primary key
- `war_room`: Related war room (ForeignKey)
- `message_type`: TEXT, SYSTEM, COMMAND, ALERT, UPDATE
- `content`: Message content
- `sender`: Sender user (ForeignKey to User)
- `sender_name`: Display name of sender
- `is_edited`: Edited flag
- `edited_at`: Edit timestamp
- `reply_to`: Reply to message (ForeignKey to self)
- `external_message_id`: External system message ID
- `external_data`: External system data (JSON)
- `created_at`: Creation timestamp
- `updated_at`: Last update timestamp
### IncidentDecision
- `id`: UUID primary key
- `incident`: Related incident (ForeignKey)
- `command_role`: Related command role (ForeignKey)
- `decision_type`: TECHNICAL, BUSINESS, COMMUNICATION, ESCALATION, RESOURCE, TIMELINE
- `title`: Decision title
- `description`: Decision description
- `rationale`: Decision rationale
- `status`: PENDING, APPROVED, REJECTED, IMPLEMENTED
- `requires_approval`: Requires approval flag
- `approved_by`: Approver (ForeignKey to User)
- `approved_at`: Approval timestamp
- `implementation_notes`: Implementation notes
- `implemented_at`: Implementation timestamp
- `implemented_by`: Implementer (ForeignKey to User)
- `impact_assessment`: Impact assessment
- `success_metrics`: Success metrics (JSON)
- `created_at`: Creation timestamp
- `updated_at`: Last update timestamp
## Integration Points
### Automatic War Room Creation
- War rooms are automatically created when new incidents are created
- Incident reporter and assignee are automatically added as participants
- Timeline events are created for war room creation
### Timeline Event Integration
- Timeline events are automatically created for:
- Incident status changes
- Severity changes
- Assignment changes
- Runbook executions
- Auto-remediation attempts
- SLA breaches
- Escalation triggers
- Command role assignments
### Security Integration
- War room access is controlled by incident access permissions
- Required clearance levels can be set for war rooms
- All actions are logged for audit purposes
### SLA & On-Call Integration
- Conference bridges can be linked to SLA instances
- Command roles can be assigned to on-call personnel
- Timeline events track SLA breaches and escalations
### Automation Integration
- Timeline events are created for runbook executions
- Auto-remediation attempts are tracked in timeline
- War rooms can be integrated with ChatOps platforms
## Error Handling
### Common Error Responses
#### 400 Bad Request
```json
{
"error": "user_id is required"
}
```
#### 403 Forbidden
```json
{
"error": "You do not have permission to join this conference"
}
```
#### 404 Not Found
```json
{
"error": "User not found"
}
```
## Authentication
All endpoints require authentication. Include the authentication token in the request headers:
```
Authorization: Token your-auth-token-here
```
## Rate Limiting
API requests are rate limited to prevent abuse. Standard rate limits apply:
- 1000 requests per hour per user
- 100 requests per minute per user
## Webhooks
The system supports webhooks for real-time notifications:
### War Room Events
- `war_room.created`: War room created
- `war_room.updated`: War room updated
- `war_room.archived`: War room archived
### Conference Events
- `conference.scheduled`: Conference scheduled
- `conference.started`: Conference started
- `conference.ended`: Conference ended
### Timeline Events
- `timeline_event.created`: Timeline event created
- `timeline_event.critical`: Critical timeline event created
### Decision Events
- `decision.created`: Decision created
- `decision.approved`: Decision approved
- `decision.implemented`: Decision implemented
## Examples
### Complete Incident Response Flow
1. **Incident Created** → War room automatically created
2. **Assign Command Roles** → Incident Commander, Scribe, Comms Lead
3. **Schedule Conference** → Emergency call for critical incidents
4. **Make Decisions** → Track all decisions with approval workflow
5. **Timeline Reconstruction** → Automatic + manual events for postmortem
### Integration with External Systems
```python
# Create war room with Slack integration
war_room = WarRoom.objects.create(
name="Incident 123 - Database Outage",
incident=incident,
slack_channel_id="C1234567890"
)
# Create conference bridge with Zoom
conference = ConferenceBridge.objects.create(
name="Emergency Call - Database Outage",
incident=incident,
war_room=war_room,
bridge_type="ZOOM",
scheduled_start=timezone.now() + timedelta(minutes=5),
scheduled_end=timezone.now() + timedelta(hours=1),
recording_enabled=True
)
```
This module provides comprehensive collaboration capabilities for incident response, ensuring effective communication, decision tracking, and postmortem analysis.

View File

@@ -0,0 +1,425 @@
# Incident-Centric Chat API Documentation
## Overview
The Incident-Centric Chat system provides real-time collaboration capabilities for incident response teams. Every incident automatically gets its own chat room with advanced features including pinned messages, reactions, file sharing, ChatOps commands, and AI assistant integration.
## Key Features
### 1. Incident-Centric Chat Rooms
- **Auto-creation**: Chat rooms are automatically created when incidents are created
- **Cross-linking**: Direct links between incident timeline and chat logs
- **Access Control**: RBAC-based access control with security clearance levels
### 2. Collaboration Features
- **@mentions**: Mention users with notifications
- **Threaded Conversations**: Reply to messages for sub-discussions
- **Reactions**: Emoji reactions (👍, 🚨, ✅) for lightweight feedback
- **Pinned Messages**: Pin important updates for easy reference
### 3. Media & Files
- **File Sharing**: Upload logs, screenshots, evidence files
- **Compliance Integration**: Automatic file classification (PUBLIC/CONFIDENTIAL/etc.)
- **Chain of Custody**: File hashing and access logging for evidence
- **Encryption**: Optional encryption for sensitive files
### 4. ChatOps Integration
- **Commands**: Execute automation commands via chat
- **Status Checks**: `/status incident-123` to fetch incident status
- **Runbook Execution**: `/run playbook ransomware-incident`
- **Escalation**: `/escalate` to trigger escalation procedures
### 5. Security Features
- **Encryption**: Chat logs encrypted at rest and in transit
- **Audit Trail**: Immutable audit trail for compliance
- **RBAC**: Role-based access control for sensitive incidents
- **Data Classification**: Automatic classification of shared content
## API Endpoints
### War Rooms
#### List War Rooms
```http
GET /api/collaboration_war_rooms/api/war-rooms/
```
#### Get War Room Details
```http
GET /api/collaboration_war_rooms/api/war-rooms/{id}/
```
#### Create Chat Room for Incident
```http
POST /api/collaboration_war_rooms/api/war-rooms/{incident_id}/create_chat_room/
```
#### Get War Room Messages
```http
GET /api/collaboration_war_rooms/api/war-rooms/{id}/messages/
```
#### Get Pinned Messages
```http
GET /api/collaboration_war_rooms/api/war-rooms/{id}/pinned_messages/
```
### Messages
#### Send Message
```http
POST /api/collaboration_war_rooms/api/war-room-messages/
Content-Type: application/json
{
"war_room_id": "uuid",
"content": "Message content",
"message_type": "TEXT",
"mentioned_user_ids": ["user-uuid-1", "user-uuid-2"]
}
```
#### Pin Message
```http
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/pin_message/
```
#### Unpin Message
```http
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/unpin_message/
```
#### Add Reaction
```http
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/add_reaction/
Content-Type: application/json
{
"emoji": "👍"
}
```
#### Remove Reaction
```http
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/remove_reaction/
Content-Type: application/json
{
"emoji": "👍"
}
```
#### Execute ChatOps Command
```http
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/execute_command/
Content-Type: application/json
{
"command_text": "/status"
}
```
### File Management
#### Upload File
```http
POST /api/collaboration_war_rooms/api/chat-files/
Content-Type: multipart/form-data
{
"message": "message-uuid",
"file": "file-data",
"file_type": "SCREENSHOT"
}
```
#### Log File Access
```http
POST /api/collaboration_war_rooms/api/chat-files/{id}/log_access/
```
### Chat Bots
#### List Chat Bots
```http
GET /api/collaboration_war_rooms/api/chat-bots/
```
#### Generate AI Response
```http
POST /api/collaboration_war_rooms/api/chat-bots/{id}/generate_response/
Content-Type: application/json
{
"message_id": "message-uuid",
"context": {}
}
```
## WebSocket API
### Connection
```javascript
const ws = new WebSocket('ws://localhost:8000/ws/chat/{room_id}/');
```
### Message Types
#### Send Chat Message
```javascript
ws.send(JSON.stringify({
type: 'chat_message',
content: 'Hello team!',
message_type: 'TEXT',
reply_to_id: 'optional-message-id'
}));
```
#### Add Reaction
```javascript
ws.send(JSON.stringify({
type: 'reaction',
message_id: 'message-uuid',
emoji: '👍',
action: 'add' // or 'remove'
}));
```
#### Execute Command
```javascript
ws.send(JSON.stringify({
type: 'command',
message_id: 'message-uuid',
command_text: '/status'
}));
```
#### Typing Indicator
```javascript
ws.send(JSON.stringify({
type: 'typing',
is_typing: true
}));
```
### Receive Messages
#### Chat Message
```javascript
ws.onmessage = function(event) {
const data = JSON.parse(event.data);
if (data.type === 'chat_message') {
// Handle new message
console.log('New message:', data.data);
}
};
```
#### Reaction Update
```javascript
if (data.type === 'reaction_update') {
// Handle reaction update
console.log('Reaction update:', data.data);
}
```
#### Command Result
```javascript
if (data.type === 'command_result') {
// Handle command execution result
console.log('Command result:', data.data);
}
```
## ChatOps Commands
### Available Commands
#### Status Check
```
/status
```
Returns current incident status, severity, assignee, and timestamps.
#### Runbook Execution
```
/runbook <playbook-name>
```
Executes a runbook for the current incident.
#### Escalation
```
/escalate [reason]
```
Triggers escalation procedures for the incident.
#### Assignment
```
/assign <username>
```
Assigns the incident to a specific user.
#### Status Update
```
/update status <new-status>
```
Updates the incident status.
### Command Response Format
```json
{
"command_type": "STATUS",
"execution_status": "SUCCESS",
"execution_result": {
"incident_id": "uuid",
"title": "Incident Title",
"status": "IN_PROGRESS",
"severity": "HIGH",
"assigned_to": "username",
"created_at": "2024-01-01T00:00:00Z",
"updated_at": "2024-01-01T00:00:00Z"
}
}
```
## Integration Points
### Incident Intelligence
- Auto-creates chat rooms when incidents are created
- Links chat messages to incident timeline
- Updates incident status via ChatOps commands
### SLA & On-Call
- Sends notifications when SLA thresholds are hit
- Integrates with escalation procedures
- Notifies on-call teams of critical updates
### Automation Orchestration
- Executes runbooks via chat commands
- Triggers auto-remediation procedures
- Provides status updates on automation execution
### Compliance & Governance
- Classifies files automatically
- Maintains audit trails for all chat activity
- Enforces data retention policies
### Security
- Encrypts sensitive messages and files
- Enforces RBAC for incident access
- Logs all security-relevant activities
### Knowledge Learning
- AI assistant provides contextual help
- Suggests similar past incidents
- Learns from chat interactions
## Security Considerations
### Access Control
- Users must have appropriate clearance level for sensitive incidents
- War room access is controlled by incident permissions
- File access is logged and audited
### Encryption
- Messages can be encrypted for sensitive incidents
- Files are encrypted based on classification level
- WebSocket connections use WSS in production
### Audit Trail
- All chat messages are logged with timestamps
- File access is tracked with user and timestamp
- Command executions are logged with results
## Best Practices
### Message Organization
- Use pinned messages for important updates
- Use reactions for quick feedback
- Use threaded replies for focused discussions
### File Management
- Classify files appropriately
- Use descriptive filenames
- Clean up temporary files regularly
### Command Usage
- Use commands for automation, not manual updates
- Verify command results before proceeding
- Document custom commands for team use
### Security
- Be mindful of sensitive information in chat
- Use appropriate classification levels
- Report security incidents immediately
## Error Handling
### Common Error Responses
#### Access Denied
```json
{
"error": "You do not have permission to access this war room"
}
```
#### Invalid Command
```json
{
"error": "Unknown command type"
}
```
#### File Upload Error
```json
{
"error": "File size exceeds limit"
}
```
### WebSocket Errors
```json
{
"type": "error",
"message": "Authentication required"
}
```
## Rate Limiting
- Message sending: 60 messages per minute per user
- File uploads: 10 files per minute per user
- Command execution: 20 commands per minute per user
- WebSocket connections: 5 concurrent connections per user
## Monitoring & Analytics
### Metrics Tracked
- Message volume per incident
- Response times for commands
- File upload/download statistics
- User engagement metrics
- Error rates and types
### Alerts
- High message volume incidents
- Failed command executions
- Security policy violations
- System performance issues
## Future Enhancements
### Planned Features
- Voice messages and video calls
- Advanced AI assistant capabilities
- Integration with external chat platforms
- Mobile app support
- Advanced analytics dashboard
### Integration Roadmap
- Slack/Teams integration
- PagerDuty integration
- Jira integration
- Custom webhook support

View File

@@ -0,0 +1,240 @@
# Incident-Centric Chat System
A comprehensive real-time collaboration platform for incident response teams, integrated with the ETB (Enterprise Incident Management) API.
## 🚀 Features
### Core Chat Functionality
- **Real-time Messaging**: WebSocket-based chat with instant message delivery
- **Incident-Centric Rooms**: Every incident automatically gets its own chat room
- **Cross-linking**: Direct links between incident timeline and chat logs
- **Pinned Messages**: Pin important updates for easy reference
- **Threaded Conversations**: Reply to messages for focused discussions
- **Reactions**: Emoji reactions (👍, 🚨, ✅) for lightweight feedback
### Advanced Collaboration
- **@mentions**: Mention users with notifications
- **File Sharing**: Upload logs, screenshots, evidence files
- **ChatOps Commands**: Execute automation commands via chat
- **AI Assistant**: Intelligent bot for incident guidance and knowledge queries
### Security & Compliance
- **Encryption**: Chat logs encrypted at rest and in transit
- **RBAC**: Role-based access control for sensitive incidents
- **Audit Trail**: Immutable audit trail for compliance
- **Data Classification**: Automatic file classification and retention
### Integrations
- **Incident Intelligence**: Auto-creates chat rooms, links to timeline
- **SLA & On-Call**: SLA threshold notifications, escalation alerts
- **Automation Orchestration**: Execute runbooks via chat commands
- **Compliance Governance**: File classification, audit trails
- **Knowledge Learning**: AI assistant with knowledge base integration
## 📋 Quick Start
### 1. Setup
```bash
# Activate virtual environment
source venv/bin/activate.fish
# Run migrations
python manage.py migrate
# Create default bots and war rooms
python manage.py setup_chat_system --create-bots --create-war-rooms
```
### 2. WebSocket Connection
```javascript
// Connect to chat room
const ws = new WebSocket('ws://localhost:8000/ws/chat/{room_id}/');
// Send message
ws.send(JSON.stringify({
type: 'chat_message',
content: 'Hello team!',
message_type: 'TEXT'
}));
// Add reaction
ws.send(JSON.stringify({
type: 'reaction',
message_id: 'message-uuid',
emoji: '👍',
action: 'add'
}));
```
### 3. ChatOps Commands
```
/status # Get incident status
/runbook <name>          # Execute runbook
/escalate [reason] # Trigger escalation
/assign <username> # Assign incident
/update status <status> # Update incident status
```
## 🏗️ Architecture
### Models
- **WarRoom**: Chat rooms for incidents
- **WarRoomMessage**: Chat messages with reactions, attachments
- **MessageReaction**: Emoji reactions to messages
- **ChatFile**: File attachments with compliance integration
- **ChatCommand**: ChatOps command execution
- **ChatBot**: AI assistant bots
### Services
- **SLANotificationService**: SLA threshold and escalation notifications
- **AutomationCommandService**: ChatOps command execution
- **ComplianceIntegrationService**: File classification and audit trails
- **AIAssistantService**: AI-powered assistance and suggestions
### WebSocket Consumer
- **ChatConsumer**: Real-time chat functionality
- Message broadcasting
- Reaction handling
- Command execution
- Typing indicators
## 🔧 API Endpoints
### War Rooms
```http
GET /api/collaboration_war_rooms/api/war-rooms/
POST /api/collaboration_war_rooms/api/war-rooms/{id}/create_chat_room/
GET /api/collaboration_war_rooms/api/war-rooms/{id}/messages/
GET /api/collaboration_war_rooms/api/war-rooms/{id}/pinned_messages/
```
### Messages
```http
POST /api/collaboration_war_rooms/api/war-room-messages/
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/pin_message/
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/add_reaction/
POST /api/collaboration_war_rooms/api/war-room-messages/{id}/execute_command/
```
### Files
```http
POST /api/collaboration_war_rooms/api/chat-files/
POST /api/collaboration_war_rooms/api/chat-files/{id}/log_access/
```
### AI Assistant
```http
GET /api/collaboration_war_rooms/api/chat-bots/
POST /api/collaboration_war_rooms/api/chat-bots/{id}/generate_response/
```
## 🔐 Security Features
### Access Control
- Users must have appropriate clearance level for sensitive incidents
- War room access controlled by incident permissions
- File access logged and audited
### Encryption
- Messages can be encrypted for sensitive incidents
- Files encrypted based on classification level
- WebSocket connections use WSS in production
### Audit Trail
- All chat messages logged with timestamps
- File access tracked with user and timestamp
- Command executions logged with results
## 📊 Monitoring & Analytics
### Metrics Tracked
- Message volume per incident
- Response times for commands
- File upload/download statistics
- User engagement metrics
- Error rates and types
### Alerts
- High message volume incidents
- Failed command executions
- Security policy violations
- System performance issues
## 🚀 Deployment
### Production Setup
1. Configure WebSocket routing in your ASGI application
2. Set up Redis for WebSocket channel layers
3. Configure file storage for attachments
4. Set up SSL certificates for WSS connections
5. Configure monitoring and alerting
### Environment Variables
```bash
# WebSocket configuration
CHANNEL_LAYERS_REDIS_URL=redis://localhost:6379/1
# File storage
DEFAULT_FILE_STORAGE=django.core.files.storage.FileSystemStorage
MEDIA_ROOT=/var/www/media/
# Security
CHAT_ENCRYPTION_KEY=your-encryption-key
CHAT_AUDIT_LOG_LEVEL=INFO
```
## 🧪 Testing
### Unit Tests
```bash
python manage.py test collaboration_war_rooms
```
### WebSocket Testing
```javascript
// Test WebSocket connection
const ws = new WebSocket('ws://localhost:8000/ws/chat/test-room/');
ws.onopen = () => console.log('Connected');
ws.onmessage = (event) => console.log('Message:', event.data);
```
## 📚 Documentation
- [API Documentation](Documentations/INCIDENT_CENTRIC_CHAT_API.md)
- [WebSocket API Reference](Documentations/INCIDENT_CENTRIC_CHAT_API.md#websocket-api)
- [ChatOps Commands](Documentations/INCIDENT_CENTRIC_CHAT_API.md#chatops-commands)
- [Security Guidelines](Documentations/INCIDENT_CENTRIC_CHAT_API.md#security-considerations)
## 🤝 Contributing
1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Add tests
5. Submit a pull request
## 📄 License
This project is part of the ETB (Enterprise Incident Management) API system.
## 🆘 Support
For support and questions:
- Check the documentation
- Review the API reference
- Contact the development team
## 🔮 Roadmap
### Planned Features
- Voice messages and video calls
- Advanced AI assistant capabilities
- Integration with external chat platforms
- Mobile app support
- Advanced analytics dashboard
### Integration Roadmap
- Slack/Teams integration
- PagerDuty integration
- Jira integration
- Custom webhook support

View File

@@ -0,0 +1,279 @@
"""
Admin configuration for Collaboration & War Rooms module
"""
from django.contrib import admin
from django.utils.html import format_html
from .models import (
WarRoom, ConferenceBridge, IncidentCommandRole,
TimelineEvent, WarRoomMessage, IncidentDecision
)
@admin.register(WarRoom)
class WarRoomAdmin(admin.ModelAdmin):
    """Admin interface for WarRoom model.

    Purely declarative configuration: changelist columns, filters,
    search fields, and collapsible fieldsets grouping the model's
    fields by concern (config, integrations, access, activity).
    """

    # incident_title is computed below; the rest are model fields/attributes.
    list_display = [
        'name', 'incident_title', 'status', 'privacy_level',
        'message_count', 'active_participants', 'created_at'
    ]
    list_filter = ['status', 'privacy_level', 'created_at']
    # incident__title lets admins search by the related incident's title.
    search_fields = ['name', 'description', 'incident__title']
    # Counters/timestamps are maintained by application code, not hand-edited.
    readonly_fields = ['id', 'created_at', 'updated_at', 'message_count', 'last_activity']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'description', 'incident')
        }),
        ('Configuration', {
            'fields': ('status', 'privacy_level', 'required_clearance_level')
        }),
        ('Integrations', {
            'fields': ('slack_channel_id', 'teams_channel_id', 'discord_channel_id'),
            'classes': ('collapse',)
        }),
        ('Access Control', {
            'fields': ('allowed_users',),
            'classes': ('collapse',)
        }),
        ('Activity Tracking', {
            'fields': ('message_count', 'last_activity', 'active_participants'),
            'classes': ('collapse',)
        }),
        ('Metadata', {
            'fields': ('created_by', 'created_at', 'updated_at', 'archived_at'),
            'classes': ('collapse',)
        }),
    )

    def incident_title(self, obj):
        """Display incident title"""
        # Changelist column: title of the related incident.
        return obj.incident.title
    incident_title.short_description = 'Incident'
@admin.register(ConferenceBridge)
class ConferenceBridgeAdmin(admin.ModelAdmin):
    """Admin interface for ConferenceBridge model.

    Declarative configuration plus two computed changelist columns:
    the related incident's title and the invited-participant count.
    """

    list_display = [
        'name', 'incident_title', 'bridge_type', 'status',
        'scheduled_start', 'participant_count'
    ]
    list_filter = ['bridge_type', 'status', 'recording_enabled', 'transcription_enabled']
    search_fields = ['name', 'description', 'incident__title']
    # Actual start/end are recorded by the bridge lifecycle, not edited here.
    readonly_fields = ['id', 'created_at', 'updated_at', 'actual_start', 'actual_end']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'name', 'description', 'incident', 'war_room')
        }),
        ('Bridge Configuration', {
            'fields': ('bridge_type', 'status', 'integration_config')
        }),
        ('Meeting Details', {
            'fields': ('meeting_id', 'meeting_url', 'dial_in_number', 'access_code')
        }),
        ('Schedule', {
            'fields': ('scheduled_start', 'scheduled_end', 'actual_start', 'actual_end')
        }),
        ('Participants', {
            'fields': ('invited_participants', 'active_participants', 'max_participants'),
            'classes': ('collapse',)
        }),
        ('Recording & Transcription', {
            'fields': ('recording_enabled', 'recording_url', 'transcription_enabled', 'transcription_url'),
            'classes': ('collapse',)
        }),
        ('Metadata', {
            'fields': ('created_by', 'created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )

    def incident_title(self, obj):
        """Display incident title"""
        return obj.incident.title
    incident_title.short_description = 'Incident'

    def participant_count(self, obj):
        """Display participant count"""
        # NOTE: one COUNT query per changelist row; fine at admin scale.
        return obj.invited_participants.count()
    participant_count.short_description = 'Participants'
@admin.register(IncidentCommandRole)
class IncidentCommandRoleAdmin(admin.ModelAdmin):
    """Admin interface for IncidentCommandRole model.

    Declarative configuration for ICS role assignments; assignment and
    performance-tracking fields are read-only because they are updated
    by application workflows.
    """

    list_display = [
        'role_type', 'incident_title', 'assigned_user', 'status',
        'decisions_made', 'assigned_at'
    ]
    list_filter = ['role_type', 'status', 'assigned_at']
    search_fields = ['incident__title', 'assigned_user__username']
    readonly_fields = [
        'id', 'assigned_at', 'reassigned_at', 'reassigned_by',
        'decisions_made', 'communications_sent', 'last_activity',
        'created_at', 'updated_at'
    ]
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'incident', 'war_room', 'role_type', 'assigned_user', 'status')
        }),
        ('Role Configuration', {
            'fields': ('responsibilities', 'decision_authority'),
            'classes': ('collapse',)
        }),
        ('Assignment Tracking', {
            'fields': ('assigned_at', 'reassigned_at', 'reassigned_by', 'assignment_notes'),
            'classes': ('collapse',)
        }),
        ('Performance Tracking', {
            'fields': ('decisions_made', 'communications_sent', 'last_activity'),
            'classes': ('collapse',)
        }),
        ('Metadata', {
            'fields': ('created_by', 'created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )

    def incident_title(self, obj):
        """Display incident title"""
        return obj.incident.title
    incident_title.short_description = 'Incident'
@admin.register(TimelineEvent)
class TimelineEventAdmin(admin.ModelAdmin):
    """Admin interface for TimelineEvent model.

    Timeline entries are append-only from the admin's point of view:
    only created_at/id are read-only, and the many optional FK links to
    related objects are tucked into a collapsed fieldset.
    """

    list_display = [
        'event_time', 'title', 'incident_title', 'event_type',
        'source_type', 'is_critical_event'
    ]
    list_filter = ['event_type', 'source_type', 'is_critical_event', 'event_time']
    search_fields = ['title', 'description', 'incident__title']
    readonly_fields = ['id', 'created_at']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'incident', 'event_type', 'title', 'description', 'source_type')
        }),
        ('Timing', {
            'fields': ('event_time', 'created_at')
        }),
        ('Related Objects', {
            # Optional links back to the object that generated this event.
            'fields': (
                'related_user', 'related_runbook_execution', 'related_auto_remediation',
                'related_sla_instance', 'related_escalation', 'related_war_room',
                'related_conference', 'related_command_role'
            ),
            'classes': ('collapse',)
        }),
        ('Event Data', {
            'fields': ('event_data', 'tags'),
            'classes': ('collapse',)
        }),
        ('Postmortem', {
            'fields': ('is_critical_event', 'postmortem_notes'),
            'classes': ('collapse',)
        }),
        ('Metadata', {
            'fields': ('created_by',),
            'classes': ('collapse',)
        }),
    )

    def incident_title(self, obj):
        """Display incident title"""
        return obj.incident.title
    incident_title.short_description = 'Incident'
@admin.register(WarRoomMessage)
class WarRoomMessageAdmin(admin.ModelAdmin):
    """Admin interface for WarRoomMessage model.

    Adds two computed changelist columns: the parent war room's name and
    a truncated preview of the message content.
    """

    list_display = [
        'sender_name', 'war_room_name', 'message_type', 'content_preview', 'created_at'
    ]
    list_filter = ['message_type', 'is_edited', 'created_at']
    search_fields = ['content', 'sender_name', 'war_room__name']
    # Edit tracking is set by application code when a message is modified.
    readonly_fields = ['id', 'created_at', 'updated_at', 'is_edited', 'edited_at']
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'war_room', 'message_type', 'content')
        }),
        ('Sender Information', {
            'fields': ('sender', 'sender_name')
        }),
        ('Message Metadata', {
            'fields': ('is_edited', 'edited_at', 'reply_to'),
            'classes': ('collapse',)
        }),
        ('Integration Data', {
            'fields': ('external_message_id', 'external_data'),
            'classes': ('collapse',)
        }),
        ('Timestamps', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )

    def war_room_name(self, obj):
        """Display war room name"""
        return obj.war_room.name
    war_room_name.short_description = 'War Room'

    def content_preview(self, obj):
        """Display content preview"""
        # Precedence: (content[:50] + '...') if long, else the full content.
        return obj.content[:50] + '...' if len(obj.content) > 50 else obj.content
    content_preview.short_description = 'Content Preview'
@admin.register(IncidentDecision)
class IncidentDecisionAdmin(admin.ModelAdmin):
    """Admin interface for IncidentDecision model.

    Approval and implementation metadata are read-only because they are
    stamped by the decision workflow, not edited manually.
    """

    list_display = [
        'title', 'incident_title', 'decision_type', 'status',
        'approved_by', 'implemented_by', 'created_at'
    ]
    list_filter = ['decision_type', 'status', 'requires_approval', 'created_at']
    search_fields = ['title', 'description', 'incident__title']
    readonly_fields = [
        'id', 'approved_by', 'approved_at', 'implemented_at',
        'implemented_by', 'created_at', 'updated_at'
    ]
    fieldsets = (
        ('Basic Information', {
            'fields': ('id', 'incident', 'command_role', 'decision_type', 'title', 'description', 'rationale')
        }),
        ('Decision Status', {
            'fields': ('status', 'requires_approval', 'approved_by', 'approved_at')
        }),
        ('Implementation', {
            'fields': ('implementation_notes', 'implemented_at', 'implemented_by'),
            'classes': ('collapse',)
        }),
        ('Impact Tracking', {
            'fields': ('impact_assessment', 'success_metrics'),
            'classes': ('collapse',)
        }),
        ('Timestamps', {
            'fields': ('created_at', 'updated_at'),
            'classes': ('collapse',)
        }),
    )

    def incident_title(self, obj):
        """Display incident title"""
        return obj.incident.title
    incident_title.short_description = 'Incident'

View File

@@ -0,0 +1,11 @@
from django.apps import AppConfig
class CollaborationWarRoomsConfig(AppConfig):
    """Django app configuration for the Collaboration & War Rooms module."""

    default_auto_field = 'django.db.models.BigAutoField'
    name = 'collaboration_war_rooms'
    verbose_name = 'Collaboration & War Rooms'

    def ready(self):
        """Register signal handlers once the app registry is populated."""
        # Importing the module is the whole point: receivers attach on import.
        from . import signals  # noqa: F401

View File

@@ -0,0 +1,414 @@
"""
WebSocket consumers for real-time chat functionality
"""
import json
import uuid
from channels.generic.websocket import AsyncWebsocketConsumer
from channels.db import database_sync_to_async
from django.contrib.auth import get_user_model
from django.utils import timezone
from .models import WarRoom, WarRoomMessage, MessageReaction, ChatCommand
User = get_user_model()
class ChatConsumer(AsyncWebsocketConsumer):
    """WebSocket consumer for real-time chat in war rooms.

    One consumer instance exists per open socket. Every socket joins the
    channel-layer group ``chat_<room_id>`` so that chat messages, reaction
    updates, command results and typing indicators are fanned out to all
    participants of the same war room.

    Incoming frames are JSON objects whose ``type`` field selects a handler:
    ``chat_message``, ``reaction``, ``command`` or ``typing``. Database work
    is wrapped in ``database_sync_to_async`` helpers because the Django ORM
    is synchronous.
    """

    async def connect(self):
        """Accept the socket, join the room group, and push room metadata.

        NOTE(review): the socket is accepted and added to the broadcast
        group before any authentication/authorization check — access is
        only enforced later in create_message()/execute_command(). An
        unauthenticated client can therefore receive broadcasts; confirm
        whether connect() should reject such clients instead.
        """
        self.room_id = self.scope['url_route']['kwargs']['room_id']
        self.room_group_name = f'chat_{self.room_id}'
        # Join room group
        await self.channel_layer.group_add(
            self.room_group_name,
            self.channel_name
        )
        await self.accept()
        # Send room info (None if the war room id does not exist)
        room_info = await self.get_room_info()
        await self.send(text_data=json.dumps({
            'type': 'room_info',
            'data': room_info
        }))

    async def disconnect(self, close_code):
        """Leave the room's broadcast group when the socket closes."""
        # Leave room group
        await self.channel_layer.group_discard(
            self.room_group_name,
            self.channel_name
        )

    async def receive(self, text_data):
        """Parse an incoming JSON frame and dispatch on its ``type`` field.

        Errors are reported back to this client only, as
        ``{"type": "error", "message": ...}`` frames.
        """
        try:
            data = json.loads(text_data)
            message_type = data.get('type')
            if message_type == 'chat_message':
                await self.handle_chat_message(data)
            elif message_type == 'reaction':
                await self.handle_reaction(data)
            elif message_type == 'command':
                await self.handle_command(data)
            elif message_type == 'typing':
                await self.handle_typing(data)
            else:
                await self.send(text_data=json.dumps({
                    'type': 'error',
                    'message': 'Unknown message type'
                }))
        except json.JSONDecodeError:
            await self.send(text_data=json.dumps({
                'type': 'error',
                'message': 'Invalid JSON'
            }))
        except Exception as e:
            # Last-resort guard so one bad frame cannot kill the consumer;
            # the raw exception text is echoed to the sender only.
            await self.send(text_data=json.dumps({
                'type': 'error',
                'message': str(e)
            }))

    async def handle_chat_message(self, data):
        """Persist a chat message and broadcast it to the room group.

        Silently drops the frame (after persistence fails) when the user is
        unauthenticated, lacks room access, or the room does not exist —
        create_message() returns None in all of those cases.
        """
        content = data.get('content', '').strip()
        message_type = data.get('message_type', 'TEXT')
        reply_to_id = data.get('reply_to_id')
        if not content:
            await self.send(text_data=json.dumps({
                'type': 'error',
                'message': 'Message content cannot be empty'
            }))
            return
        # Create message
        message = await self.create_message(content, message_type, reply_to_id)
        if message:
            # Send message to room group
            await self.channel_layer.group_send(
                self.room_group_name,
                {
                    'type': 'chat_message',
                    'message': await self.serialize_message(message)
                }
            )
            # Check for mentions and send notifications
            await self.handle_mentions(message)
            # Check for commands
            await self.check_for_commands(message)

    async def handle_reaction(self, data):
        """Add or remove an emoji reaction and broadcast the update."""
        message_id = data.get('message_id')
        emoji = data.get('emoji')
        action = data.get('action', 'add')  # 'add' or 'remove'
        if not message_id or not emoji:
            await self.send(text_data=json.dumps({
                'type': 'error',
                'message': 'Message ID and emoji are required'
            }))
            return
        # Handle reaction. add_reaction returns a reaction object (or False),
        # remove_reaction returns True (or False); `is not False` therefore
        # distinguishes failure from a successful remove.
        if action == 'add':
            reaction = await self.add_reaction(message_id, emoji)
        else:
            reaction = await self.remove_reaction(message_id, emoji)
        if reaction is not False:
            # Send reaction update to room group
            await self.channel_layer.group_send(
                self.room_group_name,
                {
                    'type': 'reaction_update',
                    'message_id': message_id,
                    'reaction': await self.serialize_reaction(reaction) if reaction else None,
                    'action': action
                }
            )

    async def handle_command(self, data):
        """Execute a ChatOps command tied to a message and broadcast the result."""
        message_id = data.get('message_id')
        command_text = data.get('command_text')
        if not message_id or not command_text:
            await self.send(text_data=json.dumps({
                'type': 'error',
                'message': 'Message ID and command text are required'
            }))
            return
        # Execute command
        result = await self.execute_command(message_id, command_text)
        # Send command result to room group
        await self.channel_layer.group_send(
            self.room_group_name,
            {
                'type': 'command_result',
                'message_id': message_id,
                'result': result
            }
        )

    async def handle_typing(self, data):
        """Broadcast this user's typing state to the room group."""
        is_typing = data.get('is_typing', False)
        # Send typing indicator to room group
        await self.channel_layer.group_send(
            self.room_group_name,
            {
                'type': 'typing_indicator',
                'user': await self.get_user_info(),
                'is_typing': is_typing
            }
        )

    # --- channel-layer group handlers ---------------------------------
    # Each method below is invoked by the channel layer for group_send
    # events whose 'type' matches the method name; it relays the payload
    # to this consumer's WebSocket client.

    async def chat_message(self, event):
        """Send chat message to WebSocket"""
        await self.send(text_data=json.dumps({
            'type': 'chat_message',
            'data': event['message']
        }))

    async def reaction_update(self, event):
        """Send reaction update to WebSocket"""
        await self.send(text_data=json.dumps({
            'type': 'reaction_update',
            'data': {
                'message_id': event['message_id'],
                'reaction': event['reaction'],
                'action': event['action']
            }
        }))

    async def command_result(self, event):
        """Send command result to WebSocket"""
        await self.send(text_data=json.dumps({
            'type': 'command_result',
            'data': {
                'message_id': event['message_id'],
                'result': event['result']
            }
        }))

    async def typing_indicator(self, event):
        """Send typing indicator to WebSocket"""
        await self.send(text_data=json.dumps({
            'type': 'typing_indicator',
            'data': {
                'user': event['user'],
                'is_typing': event['is_typing']
            }
        }))

    # --- database helpers ---------------------------------------------

    @database_sync_to_async
    def get_room_info(self):
        """Return a JSON-safe summary of the war room, or None if missing."""
        try:
            room = WarRoom.objects.get(id=self.room_id)
            return {
                'id': str(room.id),
                'name': room.name,
                'incident_id': str(room.incident.id),
                'incident_title': room.incident.title,
                'participant_count': room.allowed_users.count(),
                'message_count': room.message_count
            }
        except WarRoom.DoesNotExist:
            return None

    @database_sync_to_async
    def get_user_info(self):
        """Return the connected user's id/username, or None if anonymous."""
        user = self.scope['user']
        if user.is_authenticated:
            return {
                'id': str(user.id),
                'username': user.username,
                'display_name': getattr(user, 'display_name', user.username)
            }
        return None

    @database_sync_to_async
    def create_message(self, content, message_type, reply_to_id=None):
        """Create a WarRoomMessage and bump the room's activity counters.

        Returns None when the room is missing, the user is unauthenticated,
        or room.can_user_access(user) denies access. An unknown reply_to_id
        is ignored rather than treated as an error.
        """
        try:
            room = WarRoom.objects.get(id=self.room_id)
            user = self.scope['user']
            if not user.is_authenticated:
                return None
            # Check if user has access to room
            if not room.can_user_access(user):
                return None
            reply_to = None
            if reply_to_id:
                try:
                    reply_to = WarRoomMessage.objects.get(id=reply_to_id)
                except WarRoomMessage.DoesNotExist:
                    pass
            message = WarRoomMessage.objects.create(
                war_room=room,
                content=content,
                message_type=message_type,
                sender=user,
                sender_name=user.username,
                reply_to=reply_to
            )
            # Update room message count.
            # NOTE(review): read-modify-write increment — concurrent senders
            # can lose counts; an F('message_count') + 1 update would be
            # race-free. Confirm whether the counter needs to be exact.
            room.message_count += 1
            room.last_activity = timezone.now()
            room.save(update_fields=['message_count', 'last_activity'])
            return message
        except WarRoom.DoesNotExist:
            return None

    @database_sync_to_async
    def serialize_message(self, message):
        """Serialize a message into the JSON shape broadcast to clients."""
        return {
            'id': str(message.id),
            'content': message.content,
            'message_type': message.message_type,
            'sender': {
                'id': str(message.sender.id) if message.sender else None,
                'username': message.sender.username if message.sender else None,
                'display_name': message.sender_name
            },
            'is_pinned': message.is_pinned,
            'reply_to_id': str(message.reply_to.id) if message.reply_to else None,
            'created_at': message.created_at.isoformat(),
            # get_reactions_summary is a model method; assumed JSON-serializable.
            'reactions': list(message.get_reactions_summary())
        }

    @database_sync_to_async
    def add_reaction(self, message_id, emoji):
        """Add a reaction; returns the reaction object, or False on failure."""
        try:
            message = WarRoomMessage.objects.get(id=message_id)
            user = self.scope['user']
            if not user.is_authenticated:
                return False
            reaction = message.add_reaction(user, emoji)
            return reaction
        except WarRoomMessage.DoesNotExist:
            return False

    @database_sync_to_async
    def remove_reaction(self, message_id, emoji):
        """Remove a reaction; returns True on success, False on failure."""
        try:
            message = WarRoomMessage.objects.get(id=message_id)
            user = self.scope['user']
            if not user.is_authenticated:
                return False
            message.remove_reaction(user, emoji)
            return True
        except WarRoomMessage.DoesNotExist:
            return False

    @database_sync_to_async
    def serialize_reaction(self, reaction):
        """Serialize a reaction into the JSON shape broadcast to clients."""
        return {
            'id': str(reaction.id),
            'emoji': reaction.emoji,
            'user': {
                'id': str(reaction.user.id),
                'username': reaction.user.username
            },
            'created_at': reaction.created_at.isoformat()
        }

    @database_sync_to_async
    def execute_command(self, message_id, command_text):
        """Record and execute a ChatOps command; returns a result dict."""
        try:
            message = WarRoomMessage.objects.get(id=message_id)
            user = self.scope['user']
            if not user.is_authenticated:
                return {'error': 'Authentication required'}
            # Parse command
            command_type = self._parse_command_type(command_text)
            parameters = self._parse_command_parameters(command_text)
            # Create chat command (persisted audit record of the invocation)
            chat_command = ChatCommand.objects.create(
                message=message,
                command_type=command_type,
                command_text=command_text,
                parameters=parameters
            )
            # Execute command (delegates to the ChatCommand model)
            result = chat_command.execute_command(user)
            return result
        except WarRoomMessage.DoesNotExist:
            return {'error': 'Message not found'}

    def _parse_command_type(self, command_text):
        """Map a slash-command prefix to a ChatCommand type constant.

        Unrecognized prefixes fall through to 'CUSTOM'.
        """
        command_text = command_text.lower().strip()
        if command_text.startswith('/status'):
            return 'STATUS'
        elif command_text.startswith('/runbook'):
            return 'RUNBOOK'
        elif command_text.startswith('/escalate'):
            return 'ESCALATE'
        elif command_text.startswith('/assign'):
            return 'ASSIGN'
        elif command_text.startswith('/update'):
            return 'UPDATE'
        else:
            return 'CUSTOM'

    def _parse_command_parameters(self, command_text):
        """Return everything after the command word as {'args': [...]}."""
        parts = command_text.split()
        if len(parts) > 1:
            return {'args': parts[1:]}
        return {}

    @database_sync_to_async
    def handle_mentions(self, message):
        """Handle user mentions in message"""
        # This would integrate with notification system
        # For now, just a placeholder
        pass

    @database_sync_to_async
    def check_for_commands(self, message):
        """Check if message contains commands"""
        # This would check for command patterns and execute them
        # For now, just a placeholder
        pass

View File

@@ -0,0 +1,186 @@
"""
Management command to set up the incident-centric chat system
"""
from django.core.management.base import BaseCommand
from django.contrib.auth import get_user_model
from django.utils import timezone
from ...models import ChatBot, WarRoom
from incident_intelligence.models import Incident
User = get_user_model()
class Command(BaseCommand):
    """Set up the incident-centric chat system.

    Provides three idempotent setup steps, each behind its own flag:

    * ``--create-bots``        — create the default AI assistant bots.
    * ``--create-war-rooms``   — create a war room for every existing incident.
    * ``--create-sample-data`` — create a sample incident + war room (testing).

    Existing records are left untouched unless ``--force`` is supplied, in
    which case bots and war rooms are updated in place.
    """

    help = 'Set up the incident-centric chat system with default bots and configurations'

    def add_arguments(self, parser):
        """Register the command-line flags understood by this command."""
        parser.add_argument(
            '--create-bots',
            action='store_true',
            help='Create default AI assistant bots',
        )
        parser.add_argument(
            '--create-war-rooms',
            action='store_true',
            help='Create war rooms for existing incidents',
        )
        # Fix: create_sample_data() existed but was unreachable — no flag
        # ever invoked it.  Exposed here as an opt-in flag (backward
        # compatible; default behavior is unchanged).
        parser.add_argument(
            '--create-sample-data',
            action='store_true',
            help='Create a sample incident and war room for testing',
        )
        parser.add_argument(
            '--force',
            action='store_true',
            help='Force recreation of existing bots and war rooms',
        )

    def handle(self, *args, **options):
        """Entry point: run the setup steps selected via flags."""
        self.stdout.write(
            self.style.SUCCESS('Setting up incident-centric chat system...')
        )
        if options['create_bots']:
            self.create_default_bots(options['force'])
        if options['create_war_rooms']:
            self.create_war_rooms_for_incidents(options['force'])
        if options['create_sample_data']:
            self.create_sample_data()
        self.stdout.write(
            self.style.SUCCESS('Chat system setup completed successfully!')
        )

    def create_default_bots(self, force=False):
        """Create (or, with *force*, refresh) the default AI assistant bots.

        Bots are matched by name; existing bots are only modified when
        ``force`` is true.
        """
        self.stdout.write('Creating default AI assistant bots...')
        bots_config = [
            {
                'name': 'Incident Assistant',
                'bot_type': 'INCIDENT_ASSISTANT',
                'description': 'AI assistant for incident management and response guidance',
                'auto_respond': True,
                'response_triggers': ['help', 'assist', 'guidance', 'incident', 'problem']
            },
            {
                'name': 'Knowledge Bot',
                'bot_type': 'KNOWLEDGE_BOT',
                'description': 'AI assistant for knowledge base queries and documentation',
                'auto_respond': False,
                'response_triggers': ['how', 'what', 'where', 'documentation', 'knowledge']
            },
            {
                'name': 'Automation Bot',
                'bot_type': 'AUTOMATION_BOT',
                'description': 'AI assistant for automation and runbook execution',
                'auto_respond': False,
                'response_triggers': ['runbook', 'automation', 'execute', 'playbook']
            },
            {
                'name': 'Compliance Bot',
                'bot_type': 'COMPLIANCE_BOT',
                'description': 'AI assistant for compliance and audit requirements',
                'auto_respond': False,
                'response_triggers': ['compliance', 'audit', 'policy', 'retention']
            }
        ]
        for bot_config in bots_config:
            bot, created = ChatBot.objects.get_or_create(
                name=bot_config['name'],
                defaults={
                    'bot_type': bot_config['bot_type'],
                    'description': bot_config['description'],
                    'is_active': True,
                    'auto_respond': bot_config['auto_respond'],
                    'response_triggers': bot_config['response_triggers']
                }
            )
            if created:
                self.stdout.write(
                    self.style.SUCCESS(f'Created bot: {bot.name}')
                )
            elif force:
                # Refresh an existing bot's configuration in place.
                bot.bot_type = bot_config['bot_type']
                bot.description = bot_config['description']
                bot.auto_respond = bot_config['auto_respond']
                bot.response_triggers = bot_config['response_triggers']
                bot.save()
                self.stdout.write(
                    self.style.WARNING(f'Updated bot: {bot.name}')
                )
            else:
                self.stdout.write(
                    self.style.WARNING(f'Bot already exists: {bot.name}')
                )

    def create_war_rooms_for_incidents(self, force=False):
        """Ensure every incident has a war room.

        Newly created war rooms get the incident's reporter and assignee
        as participants.  With *force*, existing war rooms have their
        name/description re-derived from the incident.
        """
        self.stdout.write('Creating war rooms for existing incidents...')
        incidents = Incident.objects.all()
        created_count = 0
        updated_count = 0
        for incident in incidents:
            war_room, created = WarRoom.objects.get_or_create(
                incident=incident,
                defaults={
                    'name': f"Incident {incident.id} - {incident.title[:50]}",
                    'description': f"War room for incident: {incident.title}",
                    'created_by': incident.reporter,
                    'privacy_level': 'PRIVATE',
                    'status': 'ACTIVE'
                }
            )
            if created:
                # Add incident reporter and assignee to war room
                # (either may be None, hence the guards).
                if incident.reporter:
                    war_room.add_participant(incident.reporter)
                if incident.assigned_to:
                    war_room.add_participant(incident.assigned_to)
                created_count += 1
                self.stdout.write(
                    self.style.SUCCESS(f'Created war room for incident: {incident.title}')
                )
            elif force:
                war_room.name = f"Incident {incident.id} - {incident.title[:50]}"
                war_room.description = f"War room for incident: {incident.title}"
                war_room.save()
                updated_count += 1
                self.stdout.write(
                    self.style.WARNING(f'Updated war room for incident: {incident.title}')
                )
        self.stdout.write(
            self.style.SUCCESS(
                f'Created {created_count} new war rooms, updated {updated_count} existing war rooms'
            )
        )

    def create_sample_data(self):
        """Create a sample incident and war room for testing.

        Only acts when the database contains no incidents at all, so it
        is safe to run repeatedly.
        """
        self.stdout.write('Creating sample data...')
        # Create a sample incident if none exist
        if not Incident.objects.exists():
            sample_incident = Incident.objects.create(
                title='Sample Database Connection Issue',
                description='Database connection timeout affecting user authentication',
                severity='HIGH',
                status='OPEN',
                category='Database',
                subcategory='Connection'
            )
            # Create war room for sample incident
            WarRoom.objects.create(
                incident=sample_incident,
                name=f"Incident {sample_incident.id} - {sample_incident.title}",
                description=f"War room for incident: {sample_incident.title}",
                privacy_level='PRIVATE',
                status='ACTIVE'
            )
            self.stdout.write(
                self.style.SUCCESS('Created sample incident and war room')
            )

View File

@@ -0,0 +1,261 @@
# Generated by Django 5.2.6 on 2025-09-18 16:26
import django.db.models.deletion
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial schema for the collaboration_war_rooms app.

    Auto-generated by Django (makemigrations).  Creates the core
    collaboration models — ConferenceBridge, IncidentCommandRole, WarRoom,
    TimelineEvent, WarRoomMessage, IncidentDecision — plus their indexes
    and uniqueness constraints.  Do not hand-edit field definitions or
    index names: they must match what later migrations expect.
    """

    initial = True

    # Cross-app dependencies: these apps' schemas must exist before ours
    # (we hold foreign keys into each of them).
    dependencies = [
        ('automation_orchestration', '0002_autoremediationexecution_sla_instance_and_more'),
        ('incident_intelligence', '0004_incident_oncall_assignment_incident_sla_override_and_more'),
        ('security', '0002_user_emergency_contact_user_oncall_preferences_and_more'),
        ('sla_oncall', '0001_initial'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # ConferenceBridge: scheduled/active video or dial-in meetings tied
        # to an incident (Zoom/Teams/Webex/etc.).
        migrations.CreateModel(
            name='ConferenceBridge',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('description', models.TextField(blank=True, null=True)),
                ('bridge_type', models.CharField(choices=[('ZOOM', 'Zoom'), ('TEAMS', 'Microsoft Teams'), ('WEBEX', 'Cisco Webex'), ('GOTO_MEETING', 'GoTo Meeting'), ('CUSTOM', 'Custom Bridge')], max_length=20)),
                ('status', models.CharField(choices=[('SCHEDULED', 'Scheduled'), ('ACTIVE', 'Active'), ('ENDED', 'Ended'), ('CANCELLED', 'Cancelled')], default='SCHEDULED', max_length=20)),
                ('meeting_id', models.CharField(blank=True, help_text='External meeting ID', max_length=255, null=True)),
                ('meeting_url', models.URLField(blank=True, help_text='Meeting URL', null=True)),
                ('dial_in_number', models.CharField(blank=True, help_text='Dial-in phone number', max_length=50, null=True)),
                ('access_code', models.CharField(blank=True, help_text='Access code for dial-in', max_length=20, null=True)),
                ('scheduled_start', models.DateTimeField(help_text='Scheduled start time')),
                ('scheduled_end', models.DateTimeField(help_text='Scheduled end time')),
                ('actual_start', models.DateTimeField(blank=True, null=True)),
                ('actual_end', models.DateTimeField(blank=True, null=True)),
                ('max_participants', models.PositiveIntegerField(default=50)),
                ('recording_enabled', models.BooleanField(default=False)),
                ('recording_url', models.URLField(blank=True, help_text='URL to recorded meeting', null=True)),
                ('transcription_enabled', models.BooleanField(default=False)),
                ('transcription_url', models.URLField(blank=True, help_text='URL to meeting transcription', null=True)),
                ('integration_config', models.JSONField(default=dict, help_text='Configuration for external bridge integration')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('active_participants', models.ManyToManyField(blank=True, help_text='Users currently in the conference', related_name='active_conferences', to=settings.AUTH_USER_MODEL)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_conferences', to=settings.AUTH_USER_MODEL)),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='conference_bridges', to='incident_intelligence.incident')),
                ('invited_participants', models.ManyToManyField(blank=True, help_text='Users invited to the conference', related_name='invited_conferences', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['-scheduled_start'],
            },
        ),
        # IncidentCommandRole: ICS-style roles (commander, scribe, leads…)
        # assigned to users for an incident.
        migrations.CreateModel(
            name='IncidentCommandRole',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('role_type', models.CharField(choices=[('INCIDENT_COMMANDER', 'Incident Commander'), ('SCRIBE', 'Scribe'), ('COMMS_LEAD', 'Communications Lead'), ('TECHNICAL_LEAD', 'Technical Lead'), ('BUSINESS_LEAD', 'Business Lead'), ('EXTERNAL_LIAISON', 'External Liaison'), ('OBSERVER', 'Observer')], max_length=30)),
                ('status', models.CharField(choices=[('ACTIVE', 'Active'), ('INACTIVE', 'Inactive'), ('REASSIGNED', 'Reassigned')], default='ACTIVE', max_length=20)),
                ('responsibilities', models.JSONField(default=list, help_text='List of responsibilities for this role')),
                ('decision_authority', models.JSONField(default=list, help_text='Areas where this role has decision authority')),
                ('assigned_at', models.DateTimeField(auto_now_add=True)),
                ('reassigned_at', models.DateTimeField(blank=True, null=True)),
                ('assignment_notes', models.TextField(blank=True, null=True)),
                ('decisions_made', models.PositiveIntegerField(default=0)),
                ('communications_sent', models.PositiveIntegerField(default=0)),
                ('last_activity', models.DateTimeField(blank=True, null=True)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('assigned_user', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='assigned_command_roles', to=settings.AUTH_USER_MODEL)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_command_roles', to=settings.AUTH_USER_MODEL)),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='command_roles', to='incident_intelligence.incident')),
                ('reassigned_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='reassigned_command_roles', to=settings.AUTH_USER_MODEL)),
            ],
            options={
                'ordering': ['-assigned_at'],
            },
        ),
        # WarRoom: the per-incident collaboration room, optionally bridged
        # to external chat systems (Slack/Teams/Discord channel ids).
        migrations.CreateModel(
            name='WarRoom',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=200)),
                ('description', models.TextField(blank=True, null=True)),
                ('status', models.CharField(choices=[('ACTIVE', 'Active'), ('ARCHIVED', 'Archived'), ('CLOSED', 'Closed')], default='ACTIVE', max_length=20)),
                ('privacy_level', models.CharField(choices=[('PUBLIC', 'Public'), ('PRIVATE', 'Private'), ('RESTRICTED', 'Restricted')], default='PRIVATE', max_length=20)),
                ('slack_channel_id', models.CharField(blank=True, help_text='Slack channel ID', max_length=100, null=True)),
                ('teams_channel_id', models.CharField(blank=True, help_text='Teams channel ID', max_length=100, null=True)),
                ('discord_channel_id', models.CharField(blank=True, help_text='Discord channel ID', max_length=100, null=True)),
                ('message_count', models.PositiveIntegerField(default=0)),
                ('last_activity', models.DateTimeField(blank=True, null=True)),
                ('active_participants', models.PositiveIntegerField(default=0)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('archived_at', models.DateTimeField(blank=True, null=True)),
                ('allowed_users', models.ManyToManyField(blank=True, help_text='Users with access to this war room', related_name='accessible_war_rooms', to=settings.AUTH_USER_MODEL)),
                ('created_by', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_war_rooms', to=settings.AUTH_USER_MODEL)),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='war_rooms', to='incident_intelligence.incident')),
                ('required_clearance_level', models.ForeignKey(blank=True, help_text='Required clearance level for access', null=True, on_delete=django.db.models.deletion.SET_NULL, to='security.dataclassification')),
            ],
            options={
                'ordering': ['-created_at'],
            },
        ),
        # TimelineEvent: the incident's audit/postmortem timeline; links
        # out to many related objects (all SET_NULL so events survive).
        migrations.CreateModel(
            name='TimelineEvent',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('event_type', models.CharField(choices=[('INCIDENT_CREATED', 'Incident Created'), ('INCIDENT_UPDATED', 'Incident Updated'), ('ASSIGNMENT_CHANGED', 'Assignment Changed'), ('STATUS_CHANGED', 'Status Changed'), ('SEVERITY_CHANGED', 'Severity Changed'), ('COMMENT_ADDED', 'Comment Added'), ('RUNBOOK_EXECUTED', 'Runbook Executed'), ('AUTO_REMEDIATION_ATTEMPTED', 'Auto-remediation Attempted'), ('SLA_BREACHED', 'SLA Breached'), ('ESCALATION_TRIGGERED', 'Escalation Triggered'), ('WAR_ROOM_CREATED', 'War Room Created'), ('CONFERENCE_STARTED', 'Conference Started'), ('COMMAND_ROLE_ASSIGNED', 'Command Role Assigned'), ('DECISION_MADE', 'Decision Made'), ('COMMUNICATION_SENT', 'Communication Sent'), ('EXTERNAL_INTEGRATION', 'External Integration'), ('MANUAL_EVENT', 'Manual Event')], max_length=30)),
                ('title', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('source_type', models.CharField(choices=[('SYSTEM', 'System Generated'), ('USER', 'User Created'), ('INTEGRATION', 'External Integration'), ('AUTOMATION', 'Automation')], default='SYSTEM', max_length=20)),
                ('event_time', models.DateTimeField(help_text='When the event occurred')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('event_data', models.JSONField(default=dict, help_text='Additional data related to the event')),
                ('tags', models.JSONField(default=list, help_text='Tags for categorization and filtering')),
                ('is_critical_event', models.BooleanField(default=False, help_text='Whether this event is critical for postmortem analysis')),
                ('postmortem_notes', models.TextField(blank=True, help_text='Additional notes added during postmortem', null=True)),
                ('created_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='created_timeline_events', to=settings.AUTH_USER_MODEL)),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='timeline_events', to='incident_intelligence.incident')),
                ('related_auto_remediation', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='automation_orchestration.autoremediationexecution')),
                ('related_command_role', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='collaboration_war_rooms.incidentcommandrole')),
                ('related_conference', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='collaboration_war_rooms.conferencebridge')),
                ('related_escalation', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='sla_oncall.escalationinstance')),
                ('related_runbook_execution', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='automation_orchestration.runbookexecution')),
                ('related_sla_instance', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='sla_oncall.slainstance')),
                ('related_user', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to=settings.AUTH_USER_MODEL)),
                ('related_war_room', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='timeline_events', to='collaboration_war_rooms.warroom')),
            ],
            options={
                'ordering': ['event_time', 'created_at'],
            },
        ),
        # Late FK additions: these reference WarRoom, which must already be
        # created above, hence AddField rather than inline fields.
        migrations.AddField(
            model_name='incidentcommandrole',
            name='war_room',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='command_roles', to='collaboration_war_rooms.warroom'),
        ),
        migrations.AddField(
            model_name='conferencebridge',
            name='war_room',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='conference_bridges', to='collaboration_war_rooms.warroom'),
        ),
        # WarRoomMessage: chat messages within a war room, optionally
        # mirrored from an external system.
        migrations.CreateModel(
            name='WarRoomMessage',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('message_type', models.CharField(choices=[('TEXT', 'Text Message'), ('SYSTEM', 'System Message'), ('COMMAND', 'Command Message'), ('ALERT', 'Alert Message'), ('UPDATE', 'Status Update')], default='TEXT', max_length=20)),
                ('content', models.TextField()),
                ('sender_name', models.CharField(help_text='Display name of sender', max_length=100)),
                ('is_edited', models.BooleanField(default=False)),
                ('edited_at', models.DateTimeField(blank=True, null=True)),
                ('external_message_id', models.CharField(blank=True, help_text='ID in external system (Slack, Teams, etc.)', max_length=255, null=True)),
                ('external_data', models.JSONField(default=dict, help_text='Additional data from external system')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('reply_to', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='replies', to='collaboration_war_rooms.warroommessage')),
                ('sender', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='war_room_messages', to=settings.AUTH_USER_MODEL)),
                ('war_room', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='messages', to='collaboration_war_rooms.warroom')),
            ],
            options={
                'ordering': ['created_at'],
            },
        ),
        # IncidentDecision: formal decisions made by command roles, with an
        # approval/implementation lifecycle.
        migrations.CreateModel(
            name='IncidentDecision',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('decision_type', models.CharField(choices=[('TECHNICAL', 'Technical Decision'), ('BUSINESS', 'Business Decision'), ('COMMUNICATION', 'Communication Decision'), ('ESCALATION', 'Escalation Decision'), ('RESOURCE', 'Resource Allocation'), ('TIMELINE', 'Timeline Decision')], max_length=20)),
                ('title', models.CharField(max_length=200)),
                ('description', models.TextField()),
                ('rationale', models.TextField(help_text='Reasoning behind the decision')),
                ('status', models.CharField(choices=[('PENDING', 'Pending'), ('APPROVED', 'Approved'), ('REJECTED', 'Rejected'), ('IMPLEMENTED', 'Implemented')], default='PENDING', max_length=20)),
                ('requires_approval', models.BooleanField(default=False)),
                ('approved_at', models.DateTimeField(blank=True, null=True)),
                ('implementation_notes', models.TextField(blank=True, null=True)),
                ('implemented_at', models.DateTimeField(blank=True, null=True)),
                ('impact_assessment', models.TextField(blank=True, null=True)),
                ('success_metrics', models.JSONField(default=list, help_text='Metrics to measure decision success')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
                ('approved_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='approved_decisions', to=settings.AUTH_USER_MODEL)),
                ('command_role', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='decisions', to='collaboration_war_rooms.incidentcommandrole')),
                ('implemented_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='implemented_decisions', to=settings.AUTH_USER_MODEL)),
                ('incident', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='decisions', to='incident_intelligence.incident')),
            ],
            options={
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['incident', 'status'], name='collaborati_inciden_4f34eb_idx'), models.Index(fields=['command_role', 'decision_type'], name='collaborati_command_81be71_idx'), models.Index(fields=['status', 'created_at'], name='collaborati_status_3f5734_idx')],
            },
        ),
        # Indexes supporting the common query patterns (per-incident
        # filtering, timeline ordering, status dashboards).
        migrations.AddIndex(
            model_name='warroom',
            index=models.Index(fields=['incident', 'status'], name='collaborati_inciden_bd58db_idx'),
        ),
        migrations.AddIndex(
            model_name='warroom',
            index=models.Index(fields=['status', 'privacy_level'], name='collaborati_status_649ccc_idx'),
        ),
        migrations.AddIndex(
            model_name='warroom',
            index=models.Index(fields=['created_at'], name='collaborati_created_e3a240_idx'),
        ),
        migrations.AddIndex(
            model_name='timelineevent',
            index=models.Index(fields=['incident', 'event_time'], name='collaborati_inciden_3a611f_idx'),
        ),
        migrations.AddIndex(
            model_name='timelineevent',
            index=models.Index(fields=['event_type', 'event_time'], name='collaborati_event_t_d2100a_idx'),
        ),
        migrations.AddIndex(
            model_name='timelineevent',
            index=models.Index(fields=['source_type', 'event_time'], name='collaborati_source__0c3cc4_idx'),
        ),
        migrations.AddIndex(
            model_name='timelineevent',
            index=models.Index(fields=['is_critical_event', 'event_time'], name='collaborati_is_crit_28e610_idx'),
        ),
        migrations.AddIndex(
            model_name='incidentcommandrole',
            index=models.Index(fields=['incident', 'role_type'], name='collaborati_inciden_7c5ba6_idx'),
        ),
        migrations.AddIndex(
            model_name='incidentcommandrole',
            index=models.Index(fields=['assigned_user', 'status'], name='collaborati_assigne_e33d48_idx'),
        ),
        migrations.AddIndex(
            model_name='incidentcommandrole',
            index=models.Index(fields=['status', 'assigned_at'], name='collaborati_status_b2ec4b_idx'),
        ),
        # A user may hold a given role type at most once per incident.
        migrations.AlterUniqueTogether(
            name='incidentcommandrole',
            unique_together={('incident', 'role_type', 'assigned_user')},
        ),
        migrations.AddIndex(
            model_name='conferencebridge',
            index=models.Index(fields=['incident', 'status'], name='collaborati_inciden_4be2c2_idx'),
        ),
        migrations.AddIndex(
            model_name='conferencebridge',
            index=models.Index(fields=['bridge_type', 'status'], name='collaborati_bridge__44a9ea_idx'),
        ),
        migrations.AddIndex(
            model_name='conferencebridge',
            index=models.Index(fields=['scheduled_start'], name='collaborati_schedul_a93d14_idx'),
        ),
        migrations.AddIndex(
            model_name='warroommessage',
            index=models.Index(fields=['war_room', 'created_at'], name='collaborati_war_roo_6320f9_idx'),
        ),
        migrations.AddIndex(
            model_name='warroommessage',
            index=models.Index(fields=['sender', 'created_at'], name='collaborati_sender__f499b1_idx'),
        ),
        migrations.AddIndex(
            model_name='warroommessage',
            index=models.Index(fields=['message_type', 'created_at'], name='collaborati_message_a29f3d_idx'),
        ),
    ]

View File

@@ -0,0 +1,215 @@
# Generated by Django 5.2.6 on 2025-09-18 18:10
import django.db.models.deletion
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """Second migration for collaboration_war_rooms: chat enhancements.

    Auto-generated by Django (makemigrations).  Adds the chat layer —
    ChatBot, ChatCommand, ChatFile, MessageReaction — and extends
    WarRoomMessage with attachments, encryption, pinning, mentions and
    new message types.  Do not hand-edit field definitions or index
    names: they must match the generated state.
    """

    dependencies = [
        ('automation_orchestration', '0002_autoremediationexecution_sla_instance_and_more'),
        ('collaboration_war_rooms', '0001_initial'),
        ('knowledge_learning', '0001_initial'),
        ('security', '0003_adaptiveauthentication_and_more'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # ChatBot: configurable AI assistants that can auto-respond to
        # trigger keywords in war-room chat.
        migrations.CreateModel(
            name='ChatBot',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('name', models.CharField(max_length=100)),
                ('bot_type', models.CharField(choices=[('INCIDENT_ASSISTANT', 'Incident Assistant'), ('KNOWLEDGE_BOT', 'Knowledge Bot'), ('AUTOMATION_BOT', 'Automation Bot'), ('COMPLIANCE_BOT', 'Compliance Bot')], max_length=30)),
                ('description', models.TextField()),
                ('is_active', models.BooleanField(default=True)),
                ('auto_respond', models.BooleanField(default=False)),
                ('response_triggers', models.JSONField(default=list, help_text='Keywords that trigger bot responses')),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
            options={
                'ordering': ['name'],
            },
        ),
        # ChatCommand: a slash-command parsed from a message, with its
        # execution lifecycle and result.
        migrations.CreateModel(
            name='ChatCommand',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('command_type', models.CharField(choices=[('STATUS', 'Status Check'), ('RUNBOOK', 'Execute Runbook'), ('ESCALATE', 'Escalate Incident'), ('ASSIGN', 'Assign Incident'), ('UPDATE', 'Update Status'), ('CUSTOM', 'Custom Command')], max_length=20)),
                ('command_text', models.CharField(help_text='Full command text', max_length=500)),
                ('parameters', models.JSONField(default=dict, help_text='Parsed command parameters')),
                ('executed_at', models.DateTimeField(blank=True, null=True)),
                ('execution_status', models.CharField(choices=[('PENDING', 'Pending'), ('EXECUTING', 'Executing'), ('SUCCESS', 'Success'), ('FAILED', 'Failed'), ('CANCELLED', 'Cancelled')], default='PENDING', max_length=20)),
                ('execution_result', models.JSONField(default=dict, help_text='Result of command execution')),
                ('error_message', models.TextField(blank=True, null=True)),
            ],
            options={
                'ordering': ['-executed_at'],
            },
        ),
        # ChatFile: a file attached to a message, with integrity hash,
        # optional encryption and an access audit log.
        migrations.CreateModel(
            name='ChatFile',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('filename', models.CharField(max_length=255)),
                ('original_filename', models.CharField(max_length=255)),
                ('file_type', models.CharField(choices=[('IMAGE', 'Image'), ('DOCUMENT', 'Document'), ('LOG', 'Log File'), ('SCREENSHOT', 'Screenshot'), ('EVIDENCE', 'Evidence'), ('OTHER', 'Other')], max_length=20)),
                ('file_size', models.PositiveIntegerField(help_text='File size in bytes')),
                ('mime_type', models.CharField(max_length=100)),
                ('file_path', models.CharField(help_text='Path to stored file', max_length=500)),
                ('file_url', models.URLField(blank=True, help_text='Public URL for file access', null=True)),
                ('is_encrypted', models.BooleanField(default=False)),
                ('encryption_key_id', models.CharField(blank=True, max_length=255, null=True)),
                ('file_hash', models.CharField(help_text='SHA-256 hash of file', max_length=64)),
                ('uploaded_at', models.DateTimeField(auto_now_add=True)),
                ('access_log', models.JSONField(default=list, help_text='Log of who accessed this file and when')),
            ],
            options={
                'ordering': ['-uploaded_at'],
            },
        ),
        # MessageReaction: an emoji reaction on a message.
        migrations.CreateModel(
            name='MessageReaction',
            fields=[
                ('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
                ('emoji', models.CharField(help_text='Emoji reaction', max_length=10)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
            ],
            options={
                'ordering': ['created_at'],
            },
        ),
        # WarRoomMessage extensions: attachments, encryption, pinning,
        # mentions and notification tracking.
        migrations.AddField(
            model_name='warroommessage',
            name='attachments',
            field=models.JSONField(default=list, help_text='List of file attachments with metadata'),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='encryption_key_id',
            field=models.CharField(blank=True, max_length=255, null=True),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='is_encrypted',
            field=models.BooleanField(default=False),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='is_pinned',
            field=models.BooleanField(default=False, help_text='Whether this message is pinned'),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='mentioned_users',
            field=models.ManyToManyField(blank=True, help_text='Users mentioned in this message', related_name='mentioned_in_messages', to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='notification_sent',
            field=models.BooleanField(default=False),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='pinned_at',
            field=models.DateTimeField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='warroommessage',
            name='pinned_by',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='pinned_messages', to=settings.AUTH_USER_MODEL),
        ),
        # Extend message_type choices with FILE and BOT messages.
        migrations.AlterField(
            model_name='warroommessage',
            name='message_type',
            field=models.CharField(choices=[('TEXT', 'Text Message'), ('SYSTEM', 'System Message'), ('COMMAND', 'Command Message'), ('ALERT', 'Alert Message'), ('UPDATE', 'Status Update'), ('FILE', 'File Attachment'), ('BOT', 'Bot Message')], default='TEXT', max_length=20),
        ),
        migrations.AddIndex(
            model_name='warroommessage',
            index=models.Index(fields=['is_pinned', 'created_at'], name='collaborati_is_pinn_7a25dc_idx'),
        ),
        # Wire the new models to their related objects.
        migrations.AddField(
            model_name='chatbot',
            name='knowledge_base',
            field=models.ForeignKey(blank=True, help_text='Knowledge base article for bot responses', null=True, on_delete=django.db.models.deletion.SET_NULL, to='knowledge_learning.knowledgebasearticle'),
        ),
        migrations.AddField(
            model_name='chatcommand',
            name='automation_execution',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='chat_commands', to='automation_orchestration.runbookexecution'),
        ),
        migrations.AddField(
            model_name='chatcommand',
            name='executed_by',
            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='chatcommand',
            name='message',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='chat_commands', to='collaboration_war_rooms.warroommessage'),
        ),
        migrations.AddField(
            model_name='chatfile',
            name='data_classification',
            field=models.ForeignKey(blank=True, help_text='Data classification level for this file', null=True, on_delete=django.db.models.deletion.SET_NULL, to='security.dataclassification'),
        ),
        migrations.AddField(
            model_name='chatfile',
            name='message',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='chat_files', to='collaboration_war_rooms.warroommessage'),
        ),
        migrations.AddField(
            model_name='chatfile',
            name='uploaded_by',
            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='messagereaction',
            name='message',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='reactions', to='collaboration_war_rooms.warroommessage'),
        ),
        migrations.AddField(
            model_name='messagereaction',
            name='user',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        # Indexes for the new models' common lookup patterns.
        migrations.AddIndex(
            model_name='chatbot',
            index=models.Index(fields=['bot_type', 'is_active'], name='collaborati_bot_typ_6fc3ba_idx'),
        ),
        migrations.AddIndex(
            model_name='chatcommand',
            index=models.Index(fields=['command_type', 'execution_status'], name='collaborati_command_915116_idx'),
        ),
        migrations.AddIndex(
            model_name='chatcommand',
            index=models.Index(fields=['executed_by', 'executed_at'], name='collaborati_execute_d1badb_idx'),
        ),
        migrations.AddIndex(
            model_name='chatfile',
            index=models.Index(fields=['message', 'file_type'], name='collaborati_message_358b62_idx'),
        ),
        migrations.AddIndex(
            model_name='chatfile',
            index=models.Index(fields=['data_classification'], name='collaborati_data_cl_e26657_idx'),
        ),
        migrations.AddIndex(
            model_name='chatfile',
            index=models.Index(fields=['uploaded_at'], name='collaborati_uploade_a6b9bd_idx'),
        ),
        migrations.AddIndex(
            model_name='messagereaction',
            index=models.Index(fields=['message', 'emoji'], name='collaborati_message_817163_idx'),
        ),
        migrations.AddIndex(
            model_name='messagereaction',
            index=models.Index(fields=['user', 'created_at'], name='collaborati_user_id_2d3a22_idx'),
        ),
        # Each user may react with a given emoji at most once per message.
        migrations.AlterUniqueTogether(
            name='messagereaction',
            unique_together={('message', 'user', 'emoji')},
        ),
    ]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,9 @@
"""
WebSocket routing configuration for collaboration_war_rooms
"""
from django.urls import re_path
from . import consumers
# WebSocket URL patterns for war-room chat.
websocket_urlpatterns = [
    # room_id must be a canonical lowercase UUID (8-4-4-4-12 hex groups,
    # matching the UUIDField primary keys).  The previous pattern
    # ([0-9a-f-]+) also matched malformed ids such as "----" or bare
    # hyphens, which would reach the consumer and fail at lookup time.
    re_path(
        r'ws/chat/(?P<room_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})/$',
        consumers.ChatConsumer.as_asgi(),
    ),
]

Some files were not shown because too many files have changed in this diff Show More