updates
This commit is contained in:
168
Backend/src/shared/utils/sanitization.py
Normal file
168
Backend/src/shared/utils/sanitization.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
HTML/XSS sanitization utilities using bleach library.
|
||||
Prevents stored XSS attacks by sanitizing user-generated content.
|
||||
"""
|
||||
import bleach
|
||||
from typing import Optional
|
||||
|
||||
# Whitelist of HTML elements that survive sanitization of rich text.
ALLOWED_TAGS = [
    # Paragraphs and inline formatting
    'p', 'br', 'strong', 'em', 'u', 'b', 'i', 's', 'strike',
    # Links, lists and headings
    'a', 'ul', 'ol', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    # Quotes, code, rules and generic containers
    'blockquote', 'pre', 'code', 'hr', 'div', 'span',
    # Tables
    'table', 'thead', 'tbody', 'tr', 'th', 'td',
    # Images
    'img',
]

# Per-tag whitelist of attributes; a tag without an entry keeps no attributes.
ALLOWED_ATTRIBUTES = {
    # Links and images
    'a': ['href', 'title', 'target', 'rel'],
    'img': ['src', 'alt', 'title', 'width', 'height'],
    # Containers may only carry a CSS class
    'div': ['class'],
    'span': ['class'],
    'p': ['class'],
    # Tables
    'table': ['class', 'border'],
    'th': ['colspan', 'rowspan'],
    'td': ['colspan', 'rowspan'],
}

# URL schemes accepted in links and by sanitize_url().
ALLOWED_PROTOCOLS = ['http', 'https', 'mailto']

# Empty by default and not referenced by the functions in this module.
# NOTE(review): the original comment described this as "allowed CSS classes",
# while the name suggests inline CSS style properties -- confirm the intent
# before populating it.
ALLOWED_STYLES = []
|
||||
|
||||
|
||||
def sanitize_html(content: Optional[str], strip: bool = False) -> str:
    """
    Sanitize HTML content to prevent XSS attacks.

    The content is cleaned with bleach against the module-level
    ALLOWED_TAGS / ALLOWED_ATTRIBUTES / ALLOWED_PROTOCOLS whitelists and HTML
    comments are removed. If the cleaned markup contains no ``<a`` at all,
    plain-text URLs and e-mail addresses are additionally turned into links.

    Args:
        content: The HTML content to sanitize (can be None)
        strip: If True, remove disallowed tags instead of escaping them

    Returns:
        Sanitized HTML string ('' for None or any other falsy input)
    """
    if not content:
        return ''

    if not isinstance(content, str):
        content = str(content)

    # Sanitize HTML
    sanitized = bleach.clean(
        content,
        tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        protocols=ALLOWED_PROTOCOLS,
        strip=strip,
        strip_comments=True,
    )

    # Linkify URLs (convert plain URLs to links).
    # Only linkify if content doesn't already contain HTML links.
    if '<a' not in sanitized:
        # bleach.linkify() has no ``protocols`` parameter (passing one raises
        # TypeError). The accepted schemes are restricted by giving a Linker
        # a URL regex built for the allowed protocols instead.
        from bleach.linkifier import Linker, build_url_re

        linker = Linker(
            url_re=build_url_re(protocols=ALLOWED_PROTOCOLS),
            parse_email=True,
        )
        sanitized = linker.linkify(sanitized)

    return sanitized
|
||||
|
||||
|
||||
def sanitize_text(content: Optional[str]) -> str:
    """
    Remove every HTML tag from the content.

    Intended for fields that must never contain markup. Cleaning is delegated
    to bleach with an empty tag whitelist and ``strip=True``, so tags are
    dropped rather than escaped.

    Args:
        content: The content to sanitize (can be None)

    Returns:
        The content with all HTML tags removed ('' for None or any other
        falsy input)
    """
    if content:
        # Non-string values are stringified before cleaning.
        raw = content if isinstance(content, str) else str(content)
        return bleach.clean(raw, tags=[], strip=True)

    return ''
|
||||
|
||||
|
||||
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to prevent path traversal and other attacks.

    Directory components are dropped, every character other than
    alphanumerics, '.', '-' and '_' is replaced by '_', and the result is
    capped at 255 characters. A random ``<token>.bin`` name is returned when
    no usable name remains.

    Args:
        filename: The original (untrusted) filename

    Returns:
        Sanitized filename safe for filesystem operations
    """
    import os
    import secrets

    # Filesystem limit for a single path component is typically 255.
    max_length = 255

    if not filename:
        # Generate a random filename if none provided
        return f"{secrets.token_urlsafe(16)}.bin"

    # Remove path components (prevent directory traversal)
    filename = os.path.basename(filename)

    # Keep only alphanumeric, dots, dashes, and underscores; everything else
    # becomes an underscore.
    filename = ''.join(
        char if char.isalnum() or char in '._-' else '_'
        for char in filename
    )

    if len(filename) > max_length:
        name, ext = os.path.splitext(filename)
        # Cap the extension first: an over-long extension would otherwise
        # make the name budget negative, trimming the name from the wrong end
        # and leaving the result longer than the limit.
        ext = ext[:max_length]
        filename = name[:max_length - len(ext)] + ext

    # Ensure filename is not empty or a special directory entry
    if not filename or filename in ('.', '..'):
        filename = f"{secrets.token_urlsafe(16)}.bin"

    return filename
|
||||
|
||||
|
||||
def sanitize_url(url: Optional[str]) -> Optional[str]:
    """
    Sanitize URL to ensure it uses allowed protocols.

    Surrounding whitespace is removed. A URL that starts with one of the
    ALLOWED_PROTOCOLS schemes is returned as is; a URL without any scheme
    (including ``host:port`` forms) is assumed to be https; everything else
    is rejected.

    Args:
        url: The URL to sanitize

    Returns:
        Sanitized URL or None if invalid
    """
    import re

    if not url:
        return None

    if not isinstance(url, str):
        url = str(url)

    # Validate and return the same, stripped value so callers never receive
    # leading/trailing whitespace; whitespace-only input is not a URL.
    url = url.strip()
    if not url:
        return None

    # Check if URL uses allowed protocol
    url_lower = url.lower()
    if any(url_lower.startswith(proto + ':') for proto in ALLOWED_PROTOCOLS):
        return url

    # Explicit but disallowed protocol (e.g. ftp://) - reject
    if '://' in url:
        return None

    # Schemes written without '//' (javascript:, data:, vbscript:, ...) must
    # be rejected too, not prefixed with https://. A colon directly followed
    # by a digit is treated as a host:port pair and is allowed through.
    if re.match(r'[a-z][a-z0-9+.\-]*:(?![0-9])', url_lower):
        return None

    # If no protocol, assume https
    return f'https://{url}'
|
||||
|
||||
Reference in New Issue
Block a user