204 lines
5.6 KiB
Python
204 lines
5.6 KiB
Python
"""
HTML/XSS sanitization utilities built on the bleach library.

Prevents stored XSS attacks by sanitizing user-generated content.
"""
|
|
import bleach
|
|
from typing import Optional
|
|
|
|
# Tags that survive sanitization of rich-text content.
ALLOWED_TAGS = [
    # inline formatting and paragraphs
    'p', 'br', 'strong', 'em', 'u', 'b', 'i', 's', 'strike',
    # links, lists and headings
    'a', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # block-level containers
    'blockquote', 'pre', 'code', 'hr', 'div', 'span',
    # tables
    'table', 'thead', 'tbody', 'tr', 'th', 'td',
    # images
    'img',
]

# Per-tag attribute allow-list; a tag that is not listed here has no
# entry, so none of its attributes are explicitly allowed.
ALLOWED_ATTRIBUTES = {
    'a': ['href', 'title', 'target', 'rel'],
    'img': ['src', 'alt', 'title', 'width', 'height'],
    'div': ['class'],
    'span': ['class'],
    'p': ['class'],
    'table': ['class', 'border'],
    'th': ['colspan', 'rowspan'],
    'td': ['colspan', 'rowspan'],
}

# URL schemes accepted in link/image attributes and by sanitize_url().
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

# Allow-list for inline styles; currently empty.
# NOTE(review): not referenced by any function visible in this module --
# confirm whether it is still needed.
ALLOWED_STYLES = []
|
|
|
|
|
|
def sanitize_html(content: Optional[str], strip: bool = False) -> str:
    """
    Sanitize HTML content to prevent XSS attacks.

    Processing steps, in order:

    1. ``bleach.clean`` is applied with the module-level allow-lists
       (``ALLOWED_TAGS``, ``ALLOWED_ATTRIBUTES``, ``ALLOWED_PROTOCOLS``);
       HTML comments are stripped.
    2. If the cleaned markup contains no ``<a`` tag, plain URLs and e-mail
       addresses are turned into links (only URLs using
       ``ALLOWED_PROTOCOLS`` are recognised).
    3. Every ``<a ...>`` tag that has an ``href`` and mentions ``http://``
       or ``https://`` ends up with both ``noopener`` and ``noreferrer``
       in its ``rel`` attribute.

    Args:
        content: The HTML content to sanitize (can be None). Non-string
            values are converted with ``str()``.
        strip: If True, remove disallowed tags instead of escaping them

    Returns:
        Sanitized HTML string; ``''`` when ``content`` is empty or None.
    """
    if not content:
        return ''

    if not isinstance(content, str):
        content = str(content)

    sanitized = bleach.clean(
        content,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=strip,
        strip_comments=True,
    )

    # Linkify plain URLs, but only when the content has no HTML links yet.
    # bleach.linkify() has no ``protocols`` keyword (passing one raises
    # TypeError); the scheme restriction is expressed through the URL
    # regex handed to a Linker instead.
    if '<a' not in sanitized:
        from bleach.linkifier import Linker, build_url_re

        linker = Linker(
            url_re=build_url_re(protocols=ALLOWED_PROTOCOLS),
            parse_email=True,
        )
        sanitized = linker.linkify(sanitized)

    def add_rel(match):
        """Return the matched <a ...> tag with a hardened rel attribute."""
        tag = match.group(0)
        is_external = 'href=' in tag and ('http://' in tag or 'https://' in tag)
        if not is_external:
            return tag

        if 'rel=' not in tag:
            # No rel attribute yet: insert one before the closing '>'.
            return tag[:-1] + ' rel="noopener noreferrer">'

        # A rel attribute exists: prepend whichever tokens are missing so
        # a tag carrying only one of them still receives the other.
        missing = [
            token for token in ('noopener', 'noreferrer') if token not in tag
        ]
        if missing:
            prefix = ' '.join(missing) + ' '
            tag = tag.replace('rel="', 'rel="' + prefix)
            tag = tag.replace("rel='", "rel='" + prefix)
        return tag

    # Runs after linkification so that generated links are hardened too.
    if '<a' in sanitized:
        import re

        sanitized = re.sub(r'<a[^>]*>', add_rel, sanitized)

    return sanitized
|
|
|
|
|
|
def sanitize_text(content: Optional[str]) -> str:
    """
    Remove every HTML tag from ``content``, keeping only the text.

    Meant for fields that must never carry markup. The cleaning is done
    by ``bleach.clean`` with an empty tag allow-list and ``strip=True``.
    ``sanitize_text_for_html`` delegates to this function.

    Args:
        content: The value to sanitize (can be None). Non-string values
            are converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, or ``''`` when ``content`` is falsy
        (None, empty string, ...).
    """
    if content:
        raw_text = content if isinstance(content, str) else str(content)
        # An empty allow-list plus strip=True drops all markup.
        return bleach.clean(raw_text, tags=[], strip=True)

    return ''
|
|
|
|
|
|
def sanitize_text_for_html(text: Optional[str]) -> str:
    """
    Make plain text safe to embed in an HTML page.

    Thin wrapper kept for naming consistency: the call is forwarded
    unchanged to ``sanitize_text`` and its result is returned as-is.

    Args:
        text: Plain text string to process (can be None)

    Returns:
        Whatever ``sanitize_text`` returns for ``text``
    """
    cleaned = sanitize_text(text)
    return cleaned
|
|
|
|
|
|
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to prevent path traversal and other attacks.

    Steps:

    1. Directory components are dropped; both ``/`` and ``\\`` are
       treated as separators regardless of the host platform.
    2. Every character that is not alphanumeric, ``.``, ``-`` or ``_`` is
       replaced by ``_``.
    3. The result is limited to 255 characters, keeping the extension
       when the extension itself is shorter than the limit.
    4. An empty result, ``.`` or ``..`` is replaced by a random
       ``<token>.bin`` name.

    Args:
        filename: The original filename

    Returns:
        Sanitized filename safe for filesystem operations; a random
        ``<token>.bin`` name when ``filename`` is empty.
    """
    import os
    import secrets

    max_length = 255  # typical filesystem limit for a single name

    if not filename:
        # Generate a random filename if none provided
        return f"{secrets.token_urlsafe(16)}.bin"

    # Remove path components (prevent directory traversal). Backslashes
    # are normalised first so Windows-style paths are also reduced to
    # their last component on POSIX hosts.
    filename = os.path.basename(filename.replace('\\', '/'))

    # Keep only alphanumerics, dots, dashes and underscores.
    filename = ''.join(
        char if char.isalnum() or char in '._-' else '_'
        for char in filename
    )

    if len(filename) > max_length:
        stem, ext = os.path.splitext(filename)
        if len(ext) < max_length:
            filename = stem[:max_length - len(ext)] + ext
        else:
            # The "extension" alone reaches the limit, so preserving it
            # is impossible; a negative slice bound here would silently
            # skip the truncation. Cut the whole name instead.
            filename = filename[:max_length]

    # Ensure filename is not empty and not a directory reference
    if filename in ('', '.', '..'):
        filename = f"{secrets.token_urlsafe(16)}.bin"

    return filename
|
|
|
|
|
|
def sanitize_url(url: Optional[str]) -> Optional[str]:
    """
    Sanitize URL to ensure it uses allowed protocols.

    Leading and trailing whitespace is removed before validation and is
    never part of the returned value.

    Args:
        url: The URL to sanitize (can be None). Non-string values are
            converted with ``str()``.

    Returns:
        - The trimmed URL when it starts (case-insensitively) with one of
          ``ALLOWED_PROTOCOLS`` followed by ``:``.
        - ``https://`` + the trimmed URL when no scheme is present.
          ``host:port`` forms such as ``example.com:8080`` are treated as
          scheme-less.
        - None for empty/whitespace-only input and for any other scheme
          (e.g. ``ftp://...``, ``javascript:...``, ``data:...``).
    """
    if not url:
        return None

    if not isinstance(url, str):
        url = str(url)

    candidate = url.strip()
    if not candidate:
        return None

    # Check if URL uses an allowed protocol
    allowed_prefixes = tuple(f'{proto}:' for proto in ALLOWED_PROTOCOLS)
    if candidate.lower().startswith(allowed_prefixes):
        return candidate

    # Explicit scheme with authority that is not on the allow-list
    if '://' in candidate:
        return None

    # Schemes written without '//' (javascript:, data:, ...) must be
    # rejected too rather than being glued behind 'https://'. A colon
    # directly followed by a digit is taken to be a host:port separator.
    scheme, colon, remainder = candidate.partition(':')
    looks_like_scheme = (
        bool(colon)
        and scheme[:1].isalpha()
        and all(ch.isalnum() or ch in '+-.' for ch in scheme)
    )
    if looks_like_scheme and not remainder[:1].isdigit():
        return None

    # If no protocol, assume https
    return f'https://{candidate}'
|
|
|