"""
HTML/XSS sanitization utilities using bleach library.
Prevents stored XSS attacks by sanitizing user-generated content.
"""
import bleach
from typing import Optional
# Tags permitted in sanitized rich-text content (passed to bleach.clean).
ALLOWED_TAGS = [
    # Paragraphs and line breaks
    'p', 'br',
    # Inline emphasis
    'strong', 'em', 'u', 'b', 'i', 's', 'strike',
    # Hyperlinks
    'a',
    # Lists
    'ul', 'ol', 'li',
    # Headings
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Quotations, preformatted text and code
    'blockquote', 'pre', 'code',
    # Separators and generic containers
    'hr', 'div', 'span',
    # Tables
    'table', 'thead', 'tbody', 'tr', 'th', 'td',
    # Images
    'img',
]

# Per-tag attribute allow-list; tags not listed here keep no attributes.
ALLOWED_ATTRIBUTES = {
    'a': [
        'href',
        'title',
        'target',
        'rel',
    ],
    'img': [
        'src',
        'alt',
        'title',
        'width',
        'height',
    ],
    # Containers and paragraphs may only carry a CSS class.
    'div': ['class'],
    'span': ['class'],
    'p': ['class'],
    'table': [
        'class',
        'border',
    ],
    # Table cells may span columns/rows.
    'th': [
        'colspan',
        'rowspan',
    ],
    'td': [
        'colspan',
        'rowspan',
    ],
}

# URL schemes accepted in sanitized content and by sanitize_url.
ALLOWED_PROTOCOLS = [
    'http',
    'https',
    'mailto',
]

# Empty allow-list, not referenced by the code visible in this file.
# NOTE(review): the previous comment described this as "allowed CSS classes"
# although the name says styles - confirm the intended meaning before use.
ALLOWED_STYLES = []
def sanitize_html(content: Optional[str], strip: bool = False) -> str:
"""
Sanitize HTML content to prevent XSS attacks.
Args:
content: The HTML content to sanitize (can be None)
strip: If True, remove disallowed tags instead of escaping them
Returns:
Sanitized HTML string
"""
if not content:
return ''
if not isinstance(content, str):
content = str(content)
# Sanitize HTML
sanitized = bleach.clean(
content,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
protocols=ALLOWED_PROTOCOLS,
strip=strip,
strip_comments=True
)
# Additional link sanitization - ensure external links have rel="noopener noreferrer"
if '
return tag[:-1] + ' rel="noopener noreferrer">'
elif 'noopener' not in tag and 'noreferrer' not in tag:
# Add to existing rel attribute
tag = tag.replace('rel="', 'rel="noopener noreferrer ')
tag = tag.replace("rel='", "rel='noopener noreferrer ")
return tag
return tag
sanitized = re.sub(r']*>', add_rel, sanitized)
# Linkify URLs (convert plain URLs to links)
# Only linkify if content doesn't already contain HTML links
if ' str:
"""
Strip all HTML tags from content, leaving only plain text.
Useful for fields that should not contain any HTML.
Alias for sanitize_text_for_html for backward compatibility.
Args:
content: The content to sanitize (can be None)
Returns:
Plain text string with all HTML removed
"""
if not content:
return ''
if not isinstance(content, str):
content = str(content)
# Strip all HTML tags
return bleach.clean(content, tags=[], strip=True)
def sanitize_text_for_html(text: Optional[str]) -> str:
    """
    Clean text so it can be embedded in an HTML page.

    Thin wrapper kept for naming consistency: the argument is handed to
    sanitize_text unchanged and that function's result is returned as-is.

    NOTE(review): earlier documentation described the result as
    "HTML-escaped"; what sanitize_text appears to do in this file is remove
    markup via bleach rather than only escape it - confirm which contract
    callers rely on.

    Args:
        text: Plain text string to clean (forwarded to sanitize_text)
    Returns:
        Whatever sanitize_text returns for the given input
    """
    cleaned = sanitize_text(text)
    return cleaned
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to prevent path traversal and other attacks.

    Directory components are dropped, every character that is not
    alphanumeric (as judged by str.isalnum, so non-ASCII letters and digits
    are kept) or one of '.', '-', '_' is replaced by '_', and the result is
    capped at 255 characters. Empty input, or a result of '', '.' or '..',
    is replaced by a random '<token>.bin' name.

    Args:
        filename: The original (untrusted) filename; falsy values are accepted
    Returns:
        Sanitized filename safe for filesystem operations
    """
    import os
    import secrets

    # Typical per-component filesystem limit. It is applied to characters,
    # not encoded bytes.
    max_length = 255

    if not filename:
        # Generate a random filename if none provided
        return f"{secrets.token_urlsafe(16)}.bin"

    # Remove path components (prevent directory traversal)
    filename = os.path.basename(filename)

    # Keep only alphanumerics, dots, dashes and underscores; everything else
    # (spaces, separators, control characters, ...) becomes an underscore.
    filename = ''.join(
        char if char.isalnum() or char in '._-' else '_'
        for char in filename
    )

    if len(filename) > max_length:
        name, ext = os.path.splitext(filename)
        if len(ext) > max_length:
            # The extension alone exceeds the limit, so it cannot be kept
            # whole. Subtracting its length would give a negative slice bound
            # and leave the result over the limit, so hard-truncate instead.
            filename = filename[:max_length]
        else:
            # Shorten the stem and keep the extension intact.
            filename = name[:max_length - len(ext)] + ext

    # Never return an empty name or a directory reference.
    if filename in ('', '.', '..'):
        filename = f"{secrets.token_urlsafe(16)}.bin"
    return filename
def sanitize_url(
    url: Optional[str],
    allowed_protocols: Optional[list] = None,
) -> Optional[str]:
    """
    Sanitize URL to ensure it uses allowed protocols.

    Surrounding whitespace is removed before any check. A URL whose scheme
    is in the allow-list is returned (stripped, otherwise unchanged). A value
    with no scheme at all, including a bare "host:port" form, is assumed to
    be a web address and gets an "https://" prefix. Anything that carries an
    explicit scheme outside the allow-list (for example "ftp://...",
    "javascript:..." or "data:...") is rejected.

    Args:
        url: The URL to sanitize; non-string values are converted with str()
        allowed_protocols: Scheme names (without the colon) to accept.
            Defaults to the module-level ALLOWED_PROTOCOLS.
    Returns:
        Sanitized URL or None if invalid
    """
    import re

    if not url:
        return None
    if not isinstance(url, str):
        url = str(url)

    # Validate and return the same stripped value, so leading or trailing
    # whitespace never ends up inside the result.
    candidate = url.strip()
    if not candidate:
        return None

    protocols = ALLOWED_PROTOCOLS if allowed_protocols is None else allowed_protocols
    lowered = candidate.lower()

    # Check if URL uses allowed protocol
    if lowered.startswith(tuple(f'{proto.lower()}:' for proto in protocols)):
        return candidate

    # An authority marker with a scheme that was not accepted above.
    if '://' in candidate:
        return None

    # Explicit scheme without "//" (javascript:, data:, vbscript:, ...).
    # The negative lookahead leaves "host:port" forms alone: a colon followed
    # only by digits up to the end of the string or a '/', '?' or '#'.
    if re.match(r'[a-z][a-z0-9+.\-]*:(?!\d*(?:[/?#]|\Z))', lowered):
        return None

    # If no protocol, assume https
    return f'https://{candidate}'