# Hotel-Booking/Backend/src/shared/utils/sanitization.py

"""
HTML/XSS sanitization utilities using bleach library.
Prevents stored XSS attacks by sanitizing user-generated content.
"""
import bleach
from typing import Optional

# Whitelist of HTML elements that survive sanitization of rich text.
# The groups below are for readability only; the list order is unchanged.
ALLOWED_TAGS = [
    # Paragraphs and inline formatting
    'p', 'br', 'strong', 'em', 'u', 'b', 'i', 's', 'strike',
    # Links and lists
    'a', 'ul', 'ol', 'li',
    # Headings
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Quotes, preformatted text, rules and generic containers
    'blockquote', 'pre', 'code', 'hr', 'div', 'span',
    # Tables
    'table', 'thead', 'tbody', 'tr', 'th', 'td',
    # Images
    'img',
]

# Per-tag whitelist of attributes; tags not listed here keep no attributes.
ALLOWED_ATTRIBUTES = {
    # Hyperlinks
    'a': ['href', 'title', 'target', 'rel'],
    # Images
    'img': ['src', 'alt', 'title', 'width', 'height'],
    # Containers and paragraphs may only carry a class
    'div': ['class'],
    'span': ['class'],
    'p': ['class'],
    # Tables and their cells
    'table': ['class', 'border'],
    'th': ['colspan', 'rowspan'],
    'td': ['colspan', 'rowspan'],
}

# URL schemes accepted in link-like attributes and by sanitize_url
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

# Placeholder for allowed inline CSS properties; currently empty and not
# referenced by the functions in this module (can be expanded later).
ALLOWED_STYLES = []


def _ensure_safe_rel_on_external_links(html: str) -> str:
    """
    Make every external <a> tag in *html* carry rel="noopener noreferrer".

    A link counts as external when its href starts with http://, https://
    or // (protocol-relative), compared case-insensitively. Existing rel
    tokens are kept; only the missing ones are prepended.

    Args:
        html: Already-sanitized HTML markup

    Returns:
        The markup with the rel attribute added or completed where needed
    """
    import re

    # Quoted attribute values are consumed as a unit so that a '>' inside
    # e.g. title="a > b" does not cut the tag short.
    anchor_re = re.compile(r'''<a\b(?:[^>"']|"[^"]*"|'[^']*')*>''')
    # name="value" / name='value' pairs; scanning left to right means text
    # that merely looks like an attribute inside another value is skipped.
    attr_re = re.compile(r'''\s+([^\s=>"'/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')''')
    external_re = re.compile(r'(?:https?:)?//', re.IGNORECASE)
    required_tokens = ('noopener', 'noreferrer')

    def harden(match):
        tag = match.group(0)
        href = None
        rel_span = None
        rel_tokens = []
        for attr in attr_re.finditer(tag):
            value_group = 2 if attr.group(2) is not None else 3
            name = attr.group(1).lower()
            if name == 'href' and href is None:
                href = attr.group(value_group)
            elif name == 'rel' and rel_span is None:
                rel_span = attr.span(value_group)
                rel_tokens = attr.group(value_group).split()

        if href is None or not external_re.match(href.strip()):
            return tag

        if rel_span is None:
            # Insert rel attribute before the closing '>'
            return tag[:-1] + ' rel="noopener noreferrer">'

        present = {token.lower() for token in rel_tokens}
        missing = [token for token in required_tokens if token not in present]
        if not missing:
            return tag

        # Rewrite only the rel value, leaving the rest of the tag untouched
        start, end = rel_span
        return tag[:start] + ' '.join(missing + rel_tokens) + tag[end:]

    return anchor_re.sub(harden, html)


def sanitize_html(content: Optional[str], strip: bool = False) -> str:
    """
    Sanitize HTML content to prevent XSS attacks.

    The content is cleaned with bleach against ALLOWED_TAGS,
    ALLOWED_ATTRIBUTES and ALLOWED_PROTOCOLS. If the cleaned markup has no
    <a> tag, plain URLs and e-mail addresses are turned into links.
    Finally, external links get rel="noopener noreferrer".

    Args:
        content: The HTML content to sanitize (can be None); non-string
            values are converted with str()
        strip: If True, remove disallowed tags instead of escaping them

    Returns:
        Sanitized HTML string ('' for None or any other falsy input)
    """
    if not content:
        return ''

    if not isinstance(content, str):
        content = str(content)

    # Sanitize HTML
    sanitized = bleach.clean(
        content,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=strip,
        strip_comments=True
    )

    # Linkify URLs (convert plain URLs to links).
    # Only linkify if content doesn't already contain HTML links.
    if '<a' not in sanitized:
        # bleach.linkify() has no `protocols` keyword; the scheme list is
        # restricted through a custom URL regex on the Linker instead.
        from bleach.linkifier import Linker, build_url_re

        linker = Linker(
            url_re=build_url_re(protocols=ALLOWED_PROTOCOLS),
            parse_email=True
        )
        sanitized = linker.linkify(sanitized)

    # Runs after linkification so that generated links are hardened too
    if '<a' in sanitized:
        sanitized = _ensure_safe_rel_on_external_links(sanitized)

    return sanitized


def sanitize_text(content: Optional[str]) -> str:
    """
    Reduce content to plain text by removing every HTML tag.
    Intended for fields that must never contain markup.
    sanitize_text_for_html is an alias that forwards to this function.

    Args:
        content: The value to clean (can be None); non-string values are
            converted with str() first

    Returns:
        The cleaned text, or '' when content is None or otherwise falsy
    """
    if content:
        raw = content if isinstance(content, str) else str(content)
        # An empty tag whitelist combined with strip=True removes all tags
        return bleach.clean(raw, tags=[], strip=True)

    return ''


def sanitize_text_for_html(text: Optional[str]) -> str:
    """
    Prepare plain text for safe inclusion in HTML.
    Thin alias: the work is done entirely by sanitize_text, so the result
    is exactly what sanitize_text returns for the same input.

    Args:
        text: Plain text string to clean (can be None)

    Returns:
        The value returned by sanitize_text(text)
    """
    cleaned = sanitize_text(text)
    return cleaned


def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to prevent path traversal and other attacks.

    Directory components are dropped (both '/' and '\\' count as
    separators), every character other than alphanumerics, '.', '-' and
    '_' becomes '_', and the result is capped at 255 characters. If nothing
    usable remains, a random "<token>.bin" name is generated.

    Args:
        filename: The original filename

    Returns:
        Sanitized filename safe for filesystem operations
    """
    import os
    import secrets

    max_length = 255  # typical filesystem limit for a single path component

    def random_name() -> str:
        return f"{secrets.token_urlsafe(16)}.bin"

    if not filename:
        # Generate a random filename if none provided
        return random_name()

    # Remove path components (prevent directory traversal). Backslashes are
    # normalised first because os.path.basename ignores them on POSIX, so a
    # Windows-style path would otherwise keep its directory part.
    filename = os.path.basename(filename.replace('\\', '/'))

    # Keep only alphanumeric, dots, dashes, and underscores
    filename = ''.join(
        char if char.isalnum() or char in '._-' else '_'
        for char in filename
    )

    # Limit length
    if len(filename) > max_length:
        name, ext = os.path.splitext(filename)
        # Cap the extension as well: an over-long extension would make the
        # name budget negative and the result would still exceed the limit.
        # One character is reserved for the name.
        ext = ext[:max_length - 1]
        filename = name[:max_length - len(ext)] + ext

    # Ensure filename is not empty or a directory reference
    if not filename or filename == '.' or filename == '..':
        filename = random_name()

    return filename


def sanitize_url(url: Optional[str]) -> Optional[str]:
    """
    Sanitize URL to ensure it uses allowed protocols.

    Surrounding whitespace is removed. A URL whose scheme is listed in
    ALLOWED_PROTOCOLS is returned as-is; a value with no scheme gets an
    "https://" prefix; anything with another scheme (e.g. "javascript:",
    "data:", "ftp://") is rejected.

    Args:
        url: The URL to sanitize (can be None); non-string values are
            converted with str()

    Returns:
        Sanitized URL or None if invalid
    """
    import re

    if not url:
        return None

    if not isinstance(url, str):
        url = str(url)

    candidate = url.strip()
    if not candidate:
        return None

    # A scheme is a letter followed by letters, digits, '+', '.' or '-',
    # terminated by ':'
    scheme_match = re.match(r'([a-zA-Z][a-zA-Z0-9+.\-]*):', candidate)
    if scheme_match:
        if scheme_match.group(1).lower() in ALLOWED_PROTOCOLS:
            return candidate

        # "host:8080/path" also looks like "scheme:rest"; treat it as a
        # bare host only when the part after the colon is a port number.
        remainder = candidate[scheme_match.end():]
        if not re.match(r'\d+(?:[/?#]|$)', remainder):
            # Disallowed scheme such as javascript:, data: or ftp:
            return None

    # Any remaining '://' means an unrecognised or malformed protocol
    if '://' in candidate:
        return None

    # If no protocol, assume https
    return f'https://{candidate}'