""" HTML/XSS sanitization utilities using bleach library. Prevents stored XSS attacks by sanitizing user-generated content. """ import bleach from typing import Optional # Allowed HTML tags for rich text content ALLOWED_TAGS = [ 'p', 'br', 'strong', 'em', 'u', 'b', 'i', 's', 'strike', 'a', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'pre', 'code', 'hr', 'div', 'span', 'table', 'thead', 'tbody', 'tr', 'th', 'td', 'img' ] # Allowed attributes for specific tags ALLOWED_ATTRIBUTES = { 'a': ['href', 'title', 'target', 'rel'], 'img': ['src', 'alt', 'title', 'width', 'height'], 'div': ['class'], 'span': ['class'], 'p': ['class'], 'table': ['class', 'border'], 'th': ['colspan', 'rowspan'], 'td': ['colspan', 'rowspan'] } # Allowed URL schemes ALLOWED_PROTOCOLS = ['http', 'https', 'mailto'] # Allowed CSS classes (optional - can be expanded) ALLOWED_STYLES = [] def sanitize_html(content: Optional[str], strip: bool = False) -> str: """ Sanitize HTML content to prevent XSS attacks. Args: content: The HTML content to sanitize (can be None) strip: If True, remove disallowed tags instead of escaping them Returns: Sanitized HTML string """ if not content: return '' if not isinstance(content, str): content = str(content) # Sanitize HTML sanitized = bleach.clean( content, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, protocols=ALLOWED_PROTOCOLS, strip=strip, strip_comments=True ) # Additional link sanitization - ensure external links have rel="noopener noreferrer" if '' elif 'noopener' not in tag and 'noreferrer' not in tag: # Add to existing rel attribute tag = tag.replace('rel="', 'rel="noopener noreferrer ') tag = tag.replace("rel='", "rel='noopener noreferrer ") return tag return tag sanitized = re.sub(r']*>', add_rel, sanitized) # Linkify URLs (convert plain URLs to links) # Only linkify if content doesn't already contain HTML links if ' str: """ Strip all HTML tags from content, leaving only plain text. Useful for fields that should not contain any HTML. Alias for sanitize_text_for_html for backward compatibility. Args: content: The content to sanitize (can be None) Returns: Plain text string with all HTML removed """ if not content: return '' if not isinstance(content, str): content = str(content) # Strip all HTML tags return bleach.clean(content, tags=[], strip=True) def sanitize_text_for_html(text: Optional[str]) -> str: """ Escape text content to be safely included in HTML. Use this for plain text that should be displayed as-is. Alias for sanitize_text for consistency. Args: text: Plain text string to escape Returns: HTML-escaped string """ return sanitize_text(text) def sanitize_filename(filename: str) -> str: """ Sanitize filename to prevent path traversal and other attacks. Args: filename: The original filename Returns: Sanitized filename safe for filesystem operations """ import os import secrets from pathlib import Path if not filename: # Generate a random filename if none provided return f"{secrets.token_urlsafe(16)}.bin" # Remove path components (prevent directory traversal) filename = os.path.basename(filename) # Remove dangerous characters # Keep only alphanumeric, dots, dashes, and underscores safe_chars = [] for char in filename: if char.isalnum() or char in '._-': safe_chars.append(char) else: safe_chars.append('_') filename = ''.join(safe_chars) # Limit length (filesystem limit is typically 255) if len(filename) > 255: name, ext = os.path.splitext(filename) max_name_length = 255 - len(ext) filename = name[:max_name_length] + ext # Ensure filename is not empty if not filename or filename == '.' or filename == '..': filename = f"{secrets.token_urlsafe(16)}.bin" return filename def sanitize_url(url: Optional[str]) -> Optional[str]: """ Sanitize URL to ensure it uses allowed protocols. Args: url: The URL to sanitize Returns: Sanitized URL or None if invalid """ if not url: return None if not isinstance(url, str): url = str(url) # Check if URL uses allowed protocol url_lower = url.lower().strip() if any(url_lower.startswith(proto + ':') for proto in ALLOWED_PROTOCOLS): return url # If no protocol, assume https if '://' not in url: return f'https://{url}' # Invalid protocol - return None return None