Iliyan Angelov
2025-11-17 18:26:30 +02:00
parent 48353cde9c
commit 0c59fe1173
2535 changed files with 278997 additions and 2480 deletions

View File

@@ -0,0 +1,29 @@
"""add_capacity_room_size_view_to_rooms
Revision ID: 6a126cc5b23c
Revises: add_stripe_payment_method
Create Date: 2025-11-17 16:25:09.581786
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '6a126cc5b23c'
down_revision = 'add_stripe_payment_method'
branch_labels = None
depends_on = None
def upgrade() -> None:
# Add the three new columns to rooms table
op.add_column('rooms', sa.Column('capacity', sa.Integer(), nullable=True))
op.add_column('rooms', sa.Column('room_size', sa.String(length=50), nullable=True))
op.add_column('rooms', sa.Column('view', sa.String(length=100), nullable=True))
def downgrade() -> None:
# Remove the three columns from rooms table
op.drop_column('rooms', 'view')
op.drop_column('rooms', 'room_size')
op.drop_column('rooms', 'capacity')
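The inspector guard used in migration 96c23dad405d below could make this migration idempotent as well; a minimal sketch, assuming the same Alembic context and the column definitions added above:

from alembic import op
import sqlalchemy as sa

def upgrade() -> None:
    # Skip columns that already exist so reruns stay safe (same inspector
    # pattern as the system_settings migration in this commit)
    inspector = sa.inspect(op.get_bind())
    existing = {col["name"] for col in inspector.get_columns("rooms")}
    new_columns = (
        sa.Column("capacity", sa.Integer(), nullable=True),
        sa.Column("room_size", sa.String(length=50), nullable=True),
        sa.Column("view", sa.String(length=100), nullable=True),
    )
    for column in new_columns:
        if column.name not in existing:
            op.add_column("rooms", column)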

View File

@@ -0,0 +1,60 @@
"""add_system_settings_table
Revision ID: 96c23dad405d
Revises: 59baf2338f8a
Create Date: 2025-11-17 11:51:28.369031
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
# revision identifiers, used by Alembic.
revision = '96c23dad405d'
down_revision = '59baf2338f8a'
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create system_settings table (if it doesn't exist)
from sqlalchemy import inspect
bind = op.get_bind()
inspector = inspect(bind)
tables = inspector.get_table_names()
if 'system_settings' not in tables:
op.create_table('system_settings',
sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
sa.Column('key', sa.String(length=100), nullable=False),
sa.Column('value', sa.Text(), nullable=False),
sa.Column('description', sa.Text(), nullable=True),
sa.Column('updated_at', sa.DateTime(), nullable=False),
sa.Column('updated_by_id', sa.Integer(), nullable=True),
sa.ForeignKeyConstraint(['updated_by_id'], ['users.id'], ),
sa.PrimaryKeyConstraint('id')
)
op.create_index(op.f('ix_system_settings_id'), 'system_settings', ['id'], unique=False)
op.create_index(op.f('ix_system_settings_key'), 'system_settings', ['key'], unique=True)
# Add currency column to users table (if it doesn't exist)
columns = [col['name'] for col in inspector.get_columns('users')]
if 'currency' not in columns:
op.add_column('users', sa.Column('currency', sa.String(length=3), nullable=False, server_default='VND'))
# ### end Alembic commands ###
def downgrade() -> None:
# Drop currency column from users table
try:
op.drop_column('users', 'currency')
except Exception:
# Column might not exist, skip
pass
# Drop system_settings table
op.drop_index(op.f('ix_system_settings_key'), table_name='system_settings')
op.drop_index(op.f('ix_system_settings_id'), table_name='system_settings')
op.drop_table('system_settings')
# ### end Alembic commands ###

View File

@@ -0,0 +1,50 @@
"""add_stripe_payment_method
Revision ID: add_stripe_payment_method
Revises: 96c23dad405d
Create Date: 2025-01-17 12:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import mysql
# revision identifiers, used by Alembic.
revision = 'add_stripe_payment_method'
down_revision = '96c23dad405d'
branch_labels = None
depends_on = None
def upgrade() -> None:
# MySQL stores ENUM values on the column definition itself, so adding a
# value means re-declaring the full list; the existing values must be
# repeated verbatim to preserve current data when adding 'stripe'.
# Check which dialect we're running against
bind = op.get_bind()
if bind.dialect.name == 'mysql':
# Alter the ENUM column to include 'stripe'
# This preserves existing values and adds 'stripe'
op.execute(
"ALTER TABLE payments MODIFY COLUMN payment_method ENUM('cash', 'credit_card', 'debit_card', 'bank_transfer', 'e_wallet', 'stripe') NOT NULL"
)
else:
# For other databases (PostgreSQL, SQLite), enum changes are handled differently
# For SQLite, this might not be needed as it doesn't enforce enum constraints
pass
# ### end Alembic commands ###
def downgrade() -> None:
# Remove 'stripe' from the ENUM (be careful if there are existing stripe payments)
bind = op.get_bind()
if bind.dialect.name == 'mysql':
# Warning: this ALTER fails if any rows still hold 'stripe'.
# In production, migrate existing stripe payments first.
op.execute(
"ALTER TABLE payments MODIFY COLUMN payment_method ENUM('cash', 'credit_card', 'debit_card', 'bank_transfer', 'e_wallet') NOT NULL"
)
# ### end Alembic commands ###
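For the PostgreSQL branch that is currently a pass, native enum types are extended with ALTER TYPE instead; a sketch, assuming the enum type is named paymentmethod (the actual type name is not confirmed by this commit):

def upgrade() -> None:
    bind = op.get_bind()
    if bind.dialect.name == "postgresql":
        # ADD VALUE cannot run inside a transaction block on older
        # PostgreSQL versions, hence the autocommit block
        with op.get_context().autocommit_block():
            op.execute("ALTER TYPE paymentmethod ADD VALUE IF NOT EXISTS 'stripe'")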

View File

@@ -16,6 +16,7 @@ pillow==10.1.0
aiosmtplib==3.0.1
jinja2==3.1.2
alembic==1.12.1
stripe>=13.2.0
# Enterprise features (optional but recommended)
# redis==5.0.1 # Uncomment if using Redis caching

View File

@@ -91,6 +91,11 @@ class Settings(BaseSettings):
# Health Check
HEALTH_CHECK_INTERVAL: int = Field(default=30, description="Health check interval in seconds")
# Stripe Payment Gateway
STRIPE_SECRET_KEY: str = Field(default="", description="Stripe secret key")
STRIPE_PUBLISHABLE_KEY: str = Field(default="", description="Stripe publishable key")
STRIPE_WEBHOOK_SECRET: str = Field(default="", description="Stripe webhook secret")
@property
def database_url(self) -> str:
"""Construct database URL"""

View File

@@ -193,15 +193,17 @@ app.include_router(privacy_routes.router, prefix=settings.API_V1_PREFIX)
# Import and include other routes
from .routes import (
room_routes, booking_routes, payment_routes, banner_routes,
room_routes, booking_routes, payment_routes, invoice_routes, banner_routes,
favorite_routes, service_routes, promotion_routes, report_routes,
review_routes, user_routes, audit_routes, admin_privacy_routes
review_routes, user_routes, audit_routes, admin_privacy_routes,
system_settings_routes
)
# Legacy routes (maintain backward compatibility)
app.include_router(room_routes.router, prefix="/api")
app.include_router(booking_routes.router, prefix="/api")
app.include_router(payment_routes.router, prefix="/api")
app.include_router(invoice_routes.router, prefix="/api")
app.include_router(banner_routes.router, prefix="/api")
app.include_router(favorite_routes.router, prefix="/api")
app.include_router(service_routes.router, prefix="/api")
@@ -211,11 +213,13 @@ app.include_router(review_routes.router, prefix="/api")
app.include_router(user_routes.router, prefix="/api")
app.include_router(audit_routes.router, prefix="/api")
app.include_router(admin_privacy_routes.router, prefix="/api")
app.include_router(system_settings_routes.router, prefix="/api")
# Versioned routes (v1)
app.include_router(room_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(booking_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(payment_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(invoice_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(banner_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(favorite_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(service_routes.router, prefix=settings.API_V1_PREFIX)
@@ -225,6 +229,7 @@ app.include_router(review_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(user_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(audit_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(admin_privacy_routes.router, prefix=settings.API_V1_PREFIX)
app.include_router(system_settings_routes.router, prefix=settings.API_V1_PREFIX)
logger.info("All routes registered successfully")

View File

@@ -16,6 +16,8 @@ from .favorite import Favorite
from .audit_log import AuditLog
from .cookie_policy import CookiePolicy
from .cookie_integration_config import CookieIntegrationConfig
from .system_settings import SystemSettings
from .invoice import Invoice, InvoiceItem
__all__ = [
"Role",
@@ -36,5 +38,8 @@ __all__ = [
"AuditLog",
"CookiePolicy",
"CookieIntegrationConfig",
"SystemSettings",
"Invoice",
"InvoiceItem",
]

Binary file not shown.

View File

@@ -35,6 +35,7 @@ class Booking(Base):
user = relationship("User", back_populates="bookings")
room = relationship("Room", back_populates="bookings")
payments = relationship("Payment", back_populates="booking", cascade="all, delete-orphan")
invoices = relationship("Invoice", back_populates="booking", cascade="all, delete-orphan")
service_usages = relationship("ServiceUsage", back_populates="booking", cascade="all, delete-orphan")
checkin_checkout = relationship("CheckInCheckOut", back_populates="booking", uselist=False)

View File

@@ -0,0 +1,100 @@
from sqlalchemy import Column, Integer, String, DateTime, Numeric, Text, Enum, ForeignKey, Boolean
from sqlalchemy.orm import relationship
from datetime import datetime
import enum
from ..config.database import Base
class InvoiceStatus(str, enum.Enum):
draft = "draft"
sent = "sent"
paid = "paid"
overdue = "overdue"
cancelled = "cancelled"
class Invoice(Base):
__tablename__ = "invoices"
id = Column(Integer, primary_key=True, index=True, autoincrement=True)
invoice_number = Column(String(50), unique=True, nullable=False, index=True)
booking_id = Column(Integer, ForeignKey("bookings.id"), nullable=False)
user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
# Invoice details
issue_date = Column(DateTime, default=datetime.utcnow, nullable=False)
due_date = Column(DateTime, nullable=False)
paid_date = Column(DateTime, nullable=True)
# Amounts
subtotal = Column(Numeric(10, 2), nullable=False, default=0.00)
tax_rate = Column(Numeric(5, 2), nullable=False, default=0.00) # Tax percentage
tax_amount = Column(Numeric(10, 2), nullable=False, default=0.00)
discount_amount = Column(Numeric(10, 2), nullable=False, default=0.00)
total_amount = Column(Numeric(10, 2), nullable=False)
amount_paid = Column(Numeric(10, 2), nullable=False, default=0.00)
balance_due = Column(Numeric(10, 2), nullable=False)
# Status
status = Column(Enum(InvoiceStatus), nullable=False, default=InvoiceStatus.draft)
# Company/Organization information (for admin to manage)
company_name = Column(String(200), nullable=True)
company_address = Column(Text, nullable=True)
company_phone = Column(String(50), nullable=True)
company_email = Column(String(100), nullable=True)
company_tax_id = Column(String(100), nullable=True)
company_logo_url = Column(String(500), nullable=True)
# Customer information (snapshot at invoice creation)
customer_name = Column(String(200), nullable=False)
customer_email = Column(String(100), nullable=False)
customer_address = Column(Text, nullable=True)
customer_phone = Column(String(50), nullable=True)
customer_tax_id = Column(String(100), nullable=True)
# Additional information
notes = Column(Text, nullable=True)
terms_and_conditions = Column(Text, nullable=True)
payment_instructions = Column(Text, nullable=True)
# Metadata
created_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)
updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
# Relationships
booking = relationship("Booking", back_populates="invoices")
user = relationship("User", foreign_keys=[user_id], backref="invoices")
created_by = relationship("User", foreign_keys=[created_by_id])
updated_by = relationship("User", foreign_keys=[updated_by_id])
items = relationship("InvoiceItem", back_populates="invoice", cascade="all, delete-orphan")
class InvoiceItem(Base):
__tablename__ = "invoice_items"
id = Column(Integer, primary_key=True, index=True, autoincrement=True)
invoice_id = Column(Integer, ForeignKey("invoices.id"), nullable=False)
# Item details
description = Column(String(500), nullable=False)
quantity = Column(Numeric(10, 2), nullable=False, default=1.00)
unit_price = Column(Numeric(10, 2), nullable=False)
tax_rate = Column(Numeric(5, 2), nullable=False, default=0.00)
discount_amount = Column(Numeric(10, 2), nullable=False, default=0.00)
line_total = Column(Numeric(10, 2), nullable=False)
# Optional reference to booking items
room_id = Column(Integer, ForeignKey("rooms.id"), nullable=True)
service_id = Column(Integer, ForeignKey("services.id"), nullable=True)
# Metadata
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
# Relationships
invoice = relationship("Invoice", back_populates="items")
room = relationship("Room")
service = relationship("Service")
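The amount columns imply the usual invoice arithmetic (tax_rate is a percentage per the column comment); a minimal sketch of the math, as an illustration rather than the actual InvoiceService logic:

from decimal import Decimal

def compute_totals(subtotal: Decimal, tax_rate: Decimal,
                   discount_amount: Decimal, amount_paid: Decimal) -> dict:
    # tax_amount is derived from the percentage rate, rounded to cents
    tax_amount = (subtotal * tax_rate / Decimal("100")).quantize(Decimal("0.01"))
    total_amount = subtotal + tax_amount - discount_amount
    return {
        "tax_amount": tax_amount,
        "total_amount": total_amount,
        "balance_due": total_amount - amount_paid,
    }

# Example: 100.00 at 10% tax with a 5.00 discount and nothing paid
# -> tax_amount 10.00, total_amount 105.00, balance_due 105.00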

View File

@@ -11,6 +11,7 @@ class PaymentMethod(str, enum.Enum):
debit_card = "debit_card"
bank_transfer = "bank_transfer"
e_wallet = "e_wallet"
stripe = "stripe"
class PaymentType(str, enum.Enum):

View File

@@ -22,6 +22,9 @@ class Room(Base):
status = Column(Enum(RoomStatus), nullable=False, default=RoomStatus.available)
price = Column(Numeric(10, 2), nullable=False)
featured = Column(Boolean, nullable=False, default=False)
capacity = Column(Integer, nullable=True) # Room-specific capacity, overrides room_type capacity
room_size = Column(String(50), nullable=True) # e.g., "1 Room", "2 Rooms", "50 sqm"
view = Column(String(100), nullable=True) # e.g., "City View", "Ocean View", etc.
images = Column(JSON, nullable=True)
amenities = Column(JSON, nullable=True)
description = Column(Text, nullable=True)
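Because the new capacity column overrides the room type's value, readers presumably fall back when it is unset; a sketch of that lookup (the helper name is illustrative, not part of this commit):

def effective_capacity(room):
    # Room-level capacity wins; otherwise inherit from the room type
    if room.capacity is not None:
        return room.capacity
    return room.room_type.capacity if room.room_type else None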

View File

@@ -0,0 +1,21 @@
from sqlalchemy import Column, Integer, String, DateTime, Text, ForeignKey
from sqlalchemy.orm import relationship
from datetime import datetime
from ..config.database import Base
class SystemSettings(Base):
"""
System-wide settings controlled by administrators.
Stores key-value pairs for platform configuration like currency, etc.
"""
__tablename__ = "system_settings"
id = Column(Integer, primary_key=True, index=True, autoincrement=True)
key = Column(String(100), unique=True, nullable=False, index=True)
value = Column(Text, nullable=False)
description = Column(Text, nullable=True)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
updated_by_id = Column(Integer, ForeignKey("users.id"), nullable=True)
updated_by = relationship("User", lazy="joined")
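A small read helper shows the intended key-value usage; illustrative only, mirroring the default-to-VND behaviour of the /currency endpoint added below:

from sqlalchemy.orm import Session

def get_setting(db: Session, key: str, default: str | None = None) -> str | None:
    setting = db.query(SystemSettings).filter(SystemSettings.key == key).first()
    return setting.value if setting else default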

View File

@@ -15,6 +15,7 @@ class User(Base):
phone = Column(String(20), nullable=True)
address = Column(Text, nullable=True)
avatar = Column(String(255), nullable=True)
currency = Column(String(3), nullable=False, default='VND') # ISO 4217 currency code
is_active = Column(Boolean, nullable=False, default=True)
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)

View File

@@ -196,7 +196,8 @@ async def update_profile(
email=profile_data.get("email"),
phone_number=profile_data.get("phone_number"),
password=profile_data.get("password"),
current_password=profile_data.get("currentPassword")
current_password=profile_data.get("currentPassword"),
currency=profile_data.get("currency")
)
return {
"status": "success",

View File

@@ -1,5 +1,5 @@
from fastapi import APIRouter, Depends, HTTPException, status, Query
from sqlalchemy.orm import Session
from sqlalchemy.orm import Session, joinedload, selectinload
from sqlalchemy import and_, or_
from typing import Optional
from datetime import datetime
@@ -11,9 +11,11 @@ from ..config.settings import settings
from ..middleware.auth import get_current_user, authorize_roles
from ..models.user import User
from ..models.booking import Booking, BookingStatus
from ..models.room import Room
from ..models.room import Room, RoomStatus
from ..models.room_type import RoomType
from ..models.payment import Payment, PaymentMethod, PaymentType, PaymentStatus
from ..services.room_service import normalize_images, get_base_url
from fastapi import Request
from ..utils.mailer import send_email
from ..utils.email_templates import (
booking_confirmation_email_template,
@@ -129,6 +131,7 @@ async def get_all_bookings(
@router.get("/me")
async def get_my_bookings(
request: Request,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
@@ -138,6 +141,7 @@ async def get_my_bookings(
Booking.user_id == current_user.id
).order_by(Booking.created_at.desc()).all()
base_url = get_base_url(request)
result = []
for booking in bookings:
booking_dict = {
@@ -157,11 +161,25 @@ async def get_my_bookings(
# Add room info
if booking.room and booking.room.room_type:
# Normalize room images if they exist
room_images = []
if booking.room.images:
try:
room_images = normalize_images(booking.room.images, base_url)
except Exception:
room_images = []
booking_dict["room"] = {
"id": booking.room.id,
"room_number": booking.room.room_number,
"floor": booking.room.floor,
"images": room_images, # Include room images
"room_type": {
"id": booking.room.room_type.id,
"name": booking.room.room_type.name,
"base_price": float(booking.room.room_type.base_price) if booking.room.room_type.base_price else 0.0,
"capacity": booking.room.room_type.capacity,
"images": room_images, # Also include in room_type for backwards compatibility
}
}
@@ -221,10 +239,17 @@ async def create_booking(
booking_number = generate_booking_number()
# Determine if deposit is required
# Cash requires a deposit; Stripe does not (full payment or deposit is handled in the Stripe payment flow)
requires_deposit = payment_method == "cash"
deposit_percentage = 20 if requires_deposit else 0
deposit_amount = (float(total_price) * deposit_percentage) / 100 if requires_deposit else 0
# Bookings start out pending for every payment method; Stripe bookings
# are confirmed later by the payment flow once the charge succeeds
initial_status = BookingStatus.pending
# Create booking
booking = Booking(
booking_number=booking_number,
@@ -235,7 +260,7 @@ async def create_booking(
num_guests=guest_count,
total_price=total_price,
special_requests=notes,
status=BookingStatus.pending,
status=initial_status,
requires_deposit=requires_deposit,
deposit_paid=False,
)
@@ -243,24 +268,101 @@ async def create_booking(
db.add(booking)
db.flush()
# Create deposit payment if required
if requires_deposit:
# Create payment record if Stripe payment method is selected
if payment_method == "stripe":
from ..models.payment import Payment, PaymentMethod, PaymentStatus, PaymentType
payment = Payment(
booking_id=booking.id,
amount=deposit_amount,
payment_method=PaymentMethod.bank_transfer,
payment_type=PaymentType.deposit,
deposit_percentage=deposit_percentage,
amount=total_price,
payment_method=PaymentMethod.stripe,
payment_type=PaymentType.full,
payment_status=PaymentStatus.pending,
notes=f"Deposit payment ({deposit_percentage}%) for booking {booking_number}",
payment_date=None,
)
db.add(payment)
db.flush()
# Create deposit payment if required (for cash method)
# Note: For cash payments, deposit is paid on arrival, so we don't create a pending payment record
# The payment will be created when the customer pays at check-in
db.commit()
db.refresh(booking)
# Fetch with relations
booking = db.query(Booking).filter(Booking.id == booking.id).first()
# Fetch with relations for proper serialization (eager load payments;
# joinedload is imported at module level)
booking = db.query(Booking).options(joinedload(Booking.payments)).filter(Booking.id == booking.id).first()
# Determine payment_method and payment_status from payments
payment_method_from_payments = None
payment_status_from_payments = "unpaid"
if booking.payments:
latest_payment = sorted(booking.payments, key=lambda p: p.created_at, reverse=True)[0]
payment_method_from_payments = latest_payment.payment_method.value if isinstance(latest_payment.payment_method, PaymentMethod) else latest_payment.payment_method
if latest_payment.payment_status == PaymentStatus.completed:
payment_status_from_payments = "paid"
elif latest_payment.payment_status == PaymentStatus.refunded:
payment_status_from_payments = "refunded"
# Serialize booking properly
booking_dict = {
"id": booking.id,
"booking_number": booking.booking_number,
"user_id": booking.user_id,
"room_id": booking.room_id,
"check_in_date": booking.check_in_date.isoformat() if booking.check_in_date else None,
"check_out_date": booking.check_out_date.isoformat() if booking.check_out_date else None,
"guest_count": booking.num_guests,
"total_price": float(booking.total_price) if booking.total_price else 0.0,
"status": booking.status.value if isinstance(booking.status, BookingStatus) else booking.status,
"payment_method": payment_method_from_payments or payment_method,
"payment_status": payment_status_from_payments,
"deposit_paid": booking.deposit_paid,
"requires_deposit": booking.requires_deposit,
"notes": booking.special_requests,
"guest_info": {
"full_name": current_user.full_name,
"email": current_user.email,
"phone": current_user.phone_number if hasattr(current_user, 'phone_number') else (current_user.phone if hasattr(current_user, 'phone') else ""),
},
"createdAt": booking.created_at.isoformat() if booking.created_at else None,
"updatedAt": booking.updated_at.isoformat() if booking.updated_at else None,
"created_at": booking.created_at.isoformat() if booking.created_at else None,
}
# Add payments if they exist
if booking.payments:
booking_dict["payments"] = [
{
"id": p.id,
"booking_id": p.booking_id,
"amount": float(p.amount) if p.amount else 0.0,
"payment_method": p.payment_method.value if isinstance(p.payment_method, PaymentMethod) else p.payment_method,
"payment_type": p.payment_type.value if isinstance(p.payment_type, PaymentType) else p.payment_type,
"deposit_percentage": p.deposit_percentage,
"payment_status": p.payment_status.value if isinstance(p.payment_status, PaymentStatus) else p.payment_status,
"transaction_id": p.transaction_id,
"payment_date": p.payment_date.isoformat() if p.payment_date else None,
"notes": p.notes,
"created_at": p.created_at.isoformat() if p.created_at else None,
}
for p in booking.payments
]
# Add room info if available
if booking.room:
booking_dict["room"] = {
"id": booking.room.id,
"room_number": booking.room.room_number,
"floor": booking.room.floor,
}
if booking.room.room_type:
booking_dict["room"]["room_type"] = {
"id": booking.room.room_type.id,
"name": booking.room.room_type.name,
"base_price": float(booking.room.room_type.base_price) if booking.room.room_type.base_price else 0.0,
"capacity": booking.room.room_type.capacity,
}
# Send booking confirmation email (non-blocking)
try:
@@ -291,7 +393,7 @@ async def create_booking(
return {
"success": True,
"data": {"booking": booking},
"data": {"booking": booking_dict},
"message": f"Booking created. Please pay {deposit_percentage}% deposit to confirm." if requires_deposit else "Booking created successfully"
}
except HTTPException:
@@ -304,12 +406,22 @@ async def create_booking(
@router.get("/{id}")
async def get_booking_by_id(
id: int,
request: Request,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get booking by ID"""
try:
booking = db.query(Booking).filter(Booking.id == id).first()
# Eager load all relationships to avoid N+1 queries
# Using selectinload for better performance with multiple relationships
booking = db.query(Booking)\
.options(
selectinload(Booking.payments),
joinedload(Booking.user),
joinedload(Booking.room).joinedload(Room.room_type)
)\
.filter(Booking.id == id)\
.first()
if not booking:
raise HTTPException(status_code=404, detail="Booking not found")
@@ -318,6 +430,19 @@ async def get_booking_by_id(
if current_user.role_id != 1 and booking.user_id != current_user.id: # Not admin
raise HTTPException(status_code=403, detail="Forbidden")
# Determine payment_method and payment_status from payments
# Get latest payment efficiently (already loaded via joinedload)
payment_method = None
payment_status = "unpaid"
if booking.payments:
# Find latest payment (payments are already loaded, so this is fast)
latest_payment = max(booking.payments, key=lambda p: p.created_at if p.created_at else datetime.min)
payment_method = latest_payment.payment_method.value if isinstance(latest_payment.payment_method, PaymentMethod) else latest_payment.payment_method
if latest_payment.payment_status == PaymentStatus.completed:
payment_status = "paid"
elif latest_payment.payment_status == PaymentStatus.refunded:
payment_status = "refunded"
booking_dict = {
"id": booking.id,
"booking_number": booking.booking_number,
@@ -325,24 +450,56 @@ async def get_booking_by_id(
"room_id": booking.room_id,
"check_in_date": booking.check_in_date.isoformat() if booking.check_in_date else None,
"check_out_date": booking.check_out_date.isoformat() if booking.check_out_date else None,
"num_guests": booking.num_guests,
"guest_count": booking.num_guests, # Frontend expects guest_count
"total_price": float(booking.total_price) if booking.total_price else 0.0,
"status": booking.status.value if isinstance(booking.status, BookingStatus) else booking.status,
"payment_method": payment_method or "cash",
"payment_status": payment_status,
"deposit_paid": booking.deposit_paid,
"requires_deposit": booking.requires_deposit,
"special_requests": booking.special_requests,
"notes": booking.special_requests, # Frontend expects notes
"guest_info": {
"full_name": booking.user.full_name if booking.user else "",
"email": booking.user.email if booking.user else "",
"phone": booking.user.phone_number if booking.user and hasattr(booking.user, 'phone_number') else (booking.user.phone if booking.user and hasattr(booking.user, 'phone') else ""),
} if booking.user else None,
"createdAt": booking.created_at.isoformat() if booking.created_at else None,
"updatedAt": booking.updated_at.isoformat() if booking.updated_at else None,
"created_at": booking.created_at.isoformat() if booking.created_at else None,
}
# Add relations
# Only get base_url if we need it (room has images)
if booking.room and booking.room.images:
base_url = get_base_url(request)
# Normalize room images if they exist
try:
room_images = normalize_images(booking.room.images, base_url)
except Exception:
room_images = []
else:
room_images = []
if booking.room:
booking_dict["room"] = {
"id": booking.room.id,
"room_number": booking.room.room_number,
"floor": booking.room.floor,
"status": booking.room.status.value if isinstance(booking.room.status, RoomStatus) else booking.room.status,
"images": room_images, # Include room images directly on room object
}
if booking.room.room_type:
# Use room images if room_type doesn't have images (which is typical)
# RoomType doesn't have images column, images are stored on Room
room_type_images = room_images if room_images else []
booking_dict["room"]["room_type"] = {
"id": booking.room.room_type.id,
"name": booking.room.room_type.name,
"base_price": float(booking.room.room_type.base_price) if booking.room.room_type.base_price else 0.0,
"capacity": booking.room.room_type.capacity,
"images": room_type_images,
}
if booking.payments:
@@ -385,6 +542,20 @@ async def cancel_booking(
if booking.status == BookingStatus.cancelled:
raise HTTPException(status_code=400, detail="Booking already cancelled")
# Prevent cancellation of confirmed bookings
if booking.status == BookingStatus.confirmed:
raise HTTPException(
status_code=400,
detail="Cannot cancel a confirmed booking. Please contact support for assistance."
)
# Only allow cancellation of pending bookings
if booking.status != BookingStatus.pending:
raise HTTPException(
status_code=400,
detail=f"Cannot cancel booking with status: {booking.status.value}. Only pending bookings can be cancelled."
)
booking.status = BookingStatus.cancelled
db.commit()
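The latest-payment logic above appears twice in this file (booking creation and get_booking_by_id); a hedged refactor sketch that both paths could share, written against this module's existing imports:

from datetime import datetime
from ..models.payment import PaymentMethod, PaymentStatus

def derive_payment_state(payments):
    """Return (payment_method, payment_status) derived from a booking's payments."""
    if not payments:
        return None, "unpaid"
    latest = max(payments, key=lambda p: p.created_at or datetime.min)
    method = latest.payment_method.value if isinstance(latest.payment_method, PaymentMethod) else latest.payment_method
    if latest.payment_status == PaymentStatus.completed:
        return method, "paid"
    if latest.payment_status == PaymentStatus.refunded:
        return method, "refunded"
    return method, "unpaid"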

View File

@@ -0,0 +1,249 @@
from fastapi import APIRouter, Depends, HTTPException, status, Query
from sqlalchemy.orm import Session
from typing import Optional
from datetime import datetime
from ..config.database import get_db
from ..middleware.auth import get_current_user, authorize_roles
from ..models.user import User
from ..models.invoice import Invoice, InvoiceStatus
from ..models.booking import Booking
from ..services.invoice_service import InvoiceService
router = APIRouter(prefix="/invoices", tags=["invoices"])
@router.get("/")
async def get_invoices(
booking_id: Optional[int] = Query(None),
status_filter: Optional[str] = Query(None, alias="status"),
page: int = Query(1, ge=1),
limit: int = Query(10, ge=1, le=100),
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get invoices for current user (or all invoices for admin)"""
try:
# Admin can see all invoices, users can only see their own
user_id = None if current_user.role_id == 1 else current_user.id
result = InvoiceService.get_invoices(
db=db,
user_id=user_id,
booking_id=booking_id,
status=status_filter,
page=page,
limit=limit
)
return {
"status": "success",
"data": result
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.get("/{id}")
async def get_invoice_by_id(
id: int,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get invoice by ID"""
try:
invoice = InvoiceService.get_invoice(id, db)
if not invoice:
raise HTTPException(status_code=404, detail="Invoice not found")
# Check access: admin can see all, users can only see their own
if current_user.role_id != 1 and invoice["user_id"] != current_user.id:
raise HTTPException(status_code=403, detail="Forbidden")
return {
"status": "success",
"data": {"invoice": invoice}
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/")
async def create_invoice(
invoice_data: dict,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Create a new invoice from a booking (Admin/Staff only)"""
try:
# Only admin/staff can create invoices
if current_user.role_id not in [1, 2]:
raise HTTPException(status_code=403, detail="Forbidden")
booking_id = invoice_data.get("booking_id")
if not booking_id:
raise HTTPException(status_code=400, detail="booking_id is required")
# Check if booking exists
booking = db.query(Booking).filter(Booking.id == booking_id).first()
if not booking:
raise HTTPException(status_code=404, detail="Booking not found")
# Create invoice
invoice = InvoiceService.create_invoice_from_booking(
booking_id=booking_id,
db=db,
created_by_id=current_user.id,
tax_rate=invoice_data.get("tax_rate", 0.0),
discount_amount=invoice_data.get("discount_amount", 0.0),
due_days=invoice_data.get("due_days", 30),
company_name=invoice_data.get("company_name"),
company_address=invoice_data.get("company_address"),
company_phone=invoice_data.get("company_phone"),
company_email=invoice_data.get("company_email"),
company_tax_id=invoice_data.get("company_tax_id"),
company_logo_url=invoice_data.get("company_logo_url"),
customer_tax_id=invoice_data.get("customer_tax_id"),
notes=invoice_data.get("notes"),
terms_and_conditions=invoice_data.get("terms_and_conditions"),
payment_instructions=invoice_data.get("payment_instructions"),
)
return {
"status": "success",
"message": "Invoice created successfully",
"data": {"invoice": invoice}
}
except HTTPException:
raise
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.put("/{id}")
async def update_invoice(
id: int,
invoice_data: dict,
current_user: User = Depends(authorize_roles("admin", "staff")),
db: Session = Depends(get_db)
):
"""Update an invoice (Admin/Staff only)"""
try:
invoice = db.query(Invoice).filter(Invoice.id == id).first()
if not invoice:
raise HTTPException(status_code=404, detail="Invoice not found")
# Update invoice
updated_invoice = InvoiceService.update_invoice(
invoice_id=id,
db=db,
updated_by_id=current_user.id,
**invoice_data
)
return {
"status": "success",
"message": "Invoice updated successfully",
"data": {"invoice": updated_invoice}
}
except HTTPException:
raise
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/{id}/mark-paid")
async def mark_invoice_as_paid(
id: int,
payment_data: dict,
current_user: User = Depends(authorize_roles("admin", "staff")),
db: Session = Depends(get_db)
):
"""Mark an invoice as paid (Admin/Staff only)"""
try:
amount = payment_data.get("amount")
updated_invoice = InvoiceService.mark_invoice_as_paid(
invoice_id=id,
db=db,
amount=amount,
updated_by_id=current_user.id
)
return {
"status": "success",
"message": "Invoice marked as paid successfully",
"data": {"invoice": updated_invoice}
}
except HTTPException:
raise
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/{id}")
async def delete_invoice(
id: int,
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
"""Delete an invoice (Admin only)"""
try:
invoice = db.query(Invoice).filter(Invoice.id == id).first()
if not invoice:
raise HTTPException(status_code=404, detail="Invoice not found")
db.delete(invoice)
db.commit()
return {
"status": "success",
"message": "Invoice deleted successfully"
}
except HTTPException:
raise
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
@router.get("/booking/{booking_id}")
async def get_invoices_by_booking(
booking_id: int,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get all invoices for a specific booking"""
try:
# Check if booking exists and user has access
booking = db.query(Booking).filter(Booking.id == booking_id).first()
if not booking:
raise HTTPException(status_code=404, detail="Booking not found")
# Check access: admin can see all, users can only see their own bookings
if current_user.role_id != 1 and booking.user_id != current_user.id:
raise HTTPException(status_code=403, detail="Forbidden")
result = InvoiceService.get_invoices(
db=db,
booking_id=booking_id
)
return {
"status": "success",
"data": result
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
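A sketch of exercising the new endpoints with httpx; the base URL, token, booking id, and the invoice id field in the response are placeholders inferred from the handlers above:

import httpx

headers = {"Authorization": "Bearer <token>"}  # placeholder token
with httpx.Client(base_url="http://localhost:8000/api") as client:
    # Staff creates an invoice from booking 42 with 10% tax
    created = client.post("/invoices/", headers=headers,
                          json={"booking_id": 42, "tax_rate": 10.0})
    invoice = created.json()["data"]["invoice"]
    # Mark it paid; amount omitted, so the service picks its default
    client.post(f"/invoices/{invoice['id']}/mark-paid", headers=headers, json={})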

View File

@@ -1,4 +1,4 @@
from fastapi import APIRouter, Depends, HTTPException, status, Query
from fastapi import APIRouter, Depends, HTTPException, status, Query, Request, Header
from sqlalchemy.orm import Session
from typing import Optional
from datetime import datetime
@@ -12,6 +12,7 @@ from ..models.payment import Payment, PaymentMethod, PaymentType, PaymentStatus
from ..models.booking import Booking, BookingStatus
from ..utils.mailer import send_email
from ..utils.email_templates import payment_confirmation_email_template
from ..services.stripe_service import StripeService
router = APIRouter(prefix="/payments", tags=["payments"])
@@ -340,3 +341,250 @@ async def update_payment_status(
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
@router.post("/stripe/create-intent")
async def create_stripe_payment_intent(
intent_data: dict,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Create a Stripe payment intent"""
try:
# Check if Stripe is configured (from database or environment)
from ..services.stripe_service import get_stripe_secret_key
secret_key = get_stripe_secret_key(db)
if not secret_key:
secret_key = settings.STRIPE_SECRET_KEY
if not secret_key:
raise HTTPException(
status_code=500,
detail="Stripe is not configured. Please configure Stripe settings in Admin Panel or set STRIPE_SECRET_KEY environment variable."
)
booking_id = intent_data.get("booking_id")
amount = float(intent_data.get("amount", 0))
currency = intent_data.get("currency", "usd")
# Log the incoming amount for debugging
import logging
logger = logging.getLogger(__name__)
logger.info(f"Creating Stripe payment intent - Booking ID: {booking_id}, Amount: ${amount:,.2f}, Currency: {currency}")
if not booking_id or amount <= 0:
raise HTTPException(
status_code=400,
detail="booking_id and amount are required"
)
# Validate amount is reasonable (Stripe max is $999,999.99)
if amount > 999999.99:
logger.error(f"Amount ${amount:,.2f} exceeds Stripe's maximum of $999,999.99")
raise HTTPException(
status_code=400,
detail=f"Amount ${amount:,.2f} exceeds Stripe's maximum of $999,999.99. Please contact support for large payments."
)
# Verify booking exists and user has access
booking = db.query(Booking).filter(Booking.id == booking_id).first()
if not booking:
raise HTTPException(status_code=404, detail="Booking not found")
if current_user.role_id != 1 and booking.user_id != current_user.id:
raise HTTPException(status_code=403, detail="Forbidden")
# Create payment intent
intent = StripeService.create_payment_intent(
amount=amount,
currency=currency,
metadata={
"booking_id": str(booking_id),
"booking_number": booking.booking_number,
"user_id": str(current_user.id),
},
db=db
)
# Get publishable key from database or environment
from ..services.stripe_service import get_stripe_publishable_key
publishable_key = get_stripe_publishable_key(db)
if not publishable_key:
publishable_key = settings.STRIPE_PUBLISHABLE_KEY
if not publishable_key:
import logging
logger = logging.getLogger(__name__)
logger.warning("Stripe publishable key is not configured")
raise HTTPException(
status_code=500,
detail="Stripe publishable key is not configured. Please configure it in Admin Panel (Settings > Stripe Settings) or set STRIPE_PUBLISHABLE_KEY environment variable."
)
if not intent.get("client_secret"):
import logging
logger = logging.getLogger(__name__)
logger.error("Payment intent created but client_secret is missing")
raise HTTPException(
status_code=500,
detail="Failed to create payment intent. Client secret is missing."
)
return {
"status": "success",
"message": "Payment intent created successfully",
"data": {
"client_secret": intent["client_secret"],
"payment_intent_id": intent["id"],
"publishable_key": publishable_key,
}
}
except HTTPException:
raise
except ValueError as e:
import logging
logger = logging.getLogger(__name__)
logger.error(f"Payment intent creation error: {str(e)}")
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
import logging
logger = logging.getLogger(__name__)
logger.error(f"Unexpected error creating payment intent: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=str(e))
@router.post("/stripe/confirm")
async def confirm_stripe_payment(
payment_data: dict,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Confirm a Stripe payment"""
try:
payment_intent_id = payment_data.get("payment_intent_id")
booking_id = payment_data.get("booking_id")
if not payment_intent_id:
raise HTTPException(
status_code=400,
detail="payment_intent_id is required"
)
# Confirm payment (this commits the transaction internally)
payment = StripeService.confirm_payment(
payment_intent_id=payment_intent_id,
db=db,
booking_id=booking_id
)
# The service method commits internally; committing again on a clean
# session is a harmless no-op, so do it defensively
try:
    db.commit()
except Exception:
    pass
# Get fresh booking from database to get updated status (after commit)
booking = db.query(Booking).filter(Booking.id == payment["booking_id"]).first()
if booking:
db.refresh(booking)
# Send payment confirmation email (non-blocking, after commit)
# This won't affect the transaction since it's already committed
if booking and booking.user:
try:
client_url = settings.CLIENT_URL or os.getenv("CLIENT_URL", "http://localhost:5173")
email_html = payment_confirmation_email_template(
booking_number=booking.booking_number,
guest_name=booking.user.full_name,
amount=payment["amount"],
payment_method="stripe",
transaction_id=payment["transaction_id"],
client_url=client_url
)
await send_email(
to=booking.user.email,
subject=f"Payment Confirmed - {booking.booking_number}",
html=email_html
)
except Exception as e:
import logging
logger = logging.getLogger(__name__)
logger.warning(f"Failed to send payment confirmation email: {e}")
return {
"status": "success",
"message": "Payment confirmed successfully",
"data": {
"payment": payment,
"booking": {
"id": booking.id if booking else None,
"booking_number": booking.booking_number if booking else None,
"status": booking.status.value if booking else None,
}
}
}
except HTTPException:
db.rollback()
raise
except ValueError as e:
import logging
logger = logging.getLogger(__name__)
logger.error(f"Payment confirmation error: {str(e)}")
db.rollback()
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
import logging
logger = logging.getLogger(__name__)
logger.error(f"Unexpected error confirming payment: {str(e)}", exc_info=True)
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
@router.post("/stripe/webhook")
async def stripe_webhook(
request: Request,
db: Session = Depends(get_db)
):
"""Handle Stripe webhook events"""
try:
# Check if webhook secret is configured (from database or environment)
from ..services.stripe_service import get_stripe_webhook_secret
webhook_secret = get_stripe_webhook_secret(db)
if not webhook_secret:
webhook_secret = settings.STRIPE_WEBHOOK_SECRET
if not webhook_secret:
raise HTTPException(
status_code=503,
detail={
"status": "error",
"message": "Stripe webhook secret is not configured. Please configure it in Admin Panel (Settings > Stripe Settings) or set STRIPE_WEBHOOK_SECRET environment variable."
}
)
payload = await request.body()
signature = request.headers.get("stripe-signature")
if not signature:
raise HTTPException(
status_code=400,
detail="Missing stripe-signature header"
)
result = StripeService.handle_webhook(
payload=payload,
signature=signature,
db=db
)
return {
"status": "success",
"data": result
}
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
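StripeService itself is not shown in this diff; a minimal sketch of what create_payment_intent plausibly wraps, using the official stripe library (note the smallest-currency-unit conversion; zero-decimal currencies such as VND would skip the multiplication):

import stripe

def create_payment_intent(amount: float, currency: str = "usd",
                          metadata: dict | None = None, api_key: str = "") -> dict:
    # Stripe expects integer amounts in the smallest unit (cents for USD)
    intent = stripe.PaymentIntent.create(
        amount=int(round(amount * 100)),
        currency=currency,
        metadata=metadata or {},
        api_key=api_key,  # per-call key, matching the DB-or-env lookup above
    )
    return {"id": intent.id, "client_secret": intent.client_secret}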

View File

@@ -197,7 +197,7 @@ async def search_available_rooms(
raise HTTPException(status_code=500, detail=str(e))
@router.get("/{id}")
@router.get("/id/{id}")
async def get_room_by_id(id: int, request: Request, db: Session = Depends(get_db)):
"""Get room by ID"""
try:
@@ -225,9 +225,81 @@ async def get_room_by_id(id: int, request: Request, db: Session = Depends(get_db
"room_number": room.room_number,
"floor": room.floor,
"status": room.status.value if isinstance(room.status, RoomStatus) else room.status,
"price": float(room.price) if room.price else 0.0,
"price": float(room.price) if room.price is not None and room.price > 0 else None,
"featured": room.featured,
"description": room.description,
"capacity": room.capacity,
"room_size": room.room_size,
"view": room.view,
"amenities": room.amenities,
"created_at": room.created_at.isoformat() if room.created_at else None,
"updated_at": room.updated_at.isoformat() if room.updated_at else None,
"average_rating": round(float(review_stats.average_rating or 0), 1) if review_stats and review_stats.average_rating else None,
"total_reviews": review_stats.total_reviews or 0 if review_stats else 0,
}
# Normalize images
try:
room_dict["images"] = normalize_images(room.images, base_url)
except Exception:
room_dict["images"] = []
# Add room type
if room.room_type:
room_dict["room_type"] = {
"id": room.room_type.id,
"name": room.room_type.name,
"description": room.room_type.description,
"base_price": float(room.room_type.base_price) if room.room_type.base_price else 0.0,
"capacity": room.room_type.capacity,
"amenities": room.room_type.amenities,
"images": [] # RoomType doesn't have images column in DB
}
return {
"status": "success",
"data": {"room": room_dict}
}
except HTTPException:
raise
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.get("/{room_number}")
async def get_room_by_number(room_number: str, request: Request, db: Session = Depends(get_db)):
"""Get room by room number"""
try:
room = db.query(Room).filter(Room.room_number == room_number).first()
if not room:
raise HTTPException(status_code=404, detail="Room not found")
# Get review stats
review_stats = db.query(
func.avg(Review.rating).label('average_rating'),
func.count(Review.id).label('total_reviews')
).filter(
and_(
Review.room_id == room.id,
Review.status == ReviewStatus.approved
)
).first()
base_url = get_base_url(request)
room_dict = {
"id": room.id,
"room_type_id": room.room_type_id,
"room_number": room.room_number,
"floor": room.floor,
"status": room.status.value if isinstance(room.status, RoomStatus) else room.status,
"price": float(room.price) if room.price is not None and room.price > 0 else None,
"featured": room.featured,
"description": room.description,
"capacity": room.capacity,
"room_size": room.room_size,
"view": room.view,
"amenities": room.amenities,
"created_at": room.created_at.isoformat() if room.created_at else None,
"updated_at": room.updated_at.isoformat() if room.updated_at else None,
@@ -266,6 +338,7 @@ async def get_room_by_id(id: int, request: Request, db: Session = Depends(get_db
@router.post("/", dependencies=[Depends(authorize_roles("admin"))])
async def create_room(
room_data: dict,
request: Request,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
@@ -281,6 +354,13 @@ async def create_room(
if existing:
raise HTTPException(status_code=400, detail="Room number already exists")
# Ensure amenities is always a list
amenities_value = room_data.get("amenities", [])
if amenities_value is None:
amenities_value = []
elif not isinstance(amenities_value, list):
amenities_value = []
room = Room(
room_type_id=room_data.get("room_type_id"),
room_number=room_data.get("room_number"),
@@ -288,16 +368,60 @@ async def create_room(
status=RoomStatus(room_data.get("status", "available")),
featured=room_data.get("featured", False),
price=room_data.get("price", room_type.base_price),
description=room_data.get("description"),
capacity=room_data.get("capacity"),
room_size=room_data.get("room_size"),
view=room_data.get("view"),
amenities=amenities_value,
)
db.add(room)
db.commit()
db.refresh(room)
# Get base URL for proper response
base_url = get_base_url(request)
# Serialize room data
room_dict = {
"id": room.id,
"room_type_id": room.room_type_id,
"room_number": room.room_number,
"floor": room.floor,
"status": room.status.value if isinstance(room.status, RoomStatus) else room.status,
"price": float(room.price) if room.price is not None and room.price > 0 else None,
"featured": room.featured,
"description": room.description,
"capacity": room.capacity,
"room_size": room.room_size,
"view": room.view,
"amenities": room.amenities if room.amenities else [],
"created_at": room.created_at.isoformat() if room.created_at else None,
"updated_at": room.updated_at.isoformat() if room.updated_at else None,
}
# Normalize images
try:
room_dict["images"] = normalize_images(room.images, base_url)
except Exception:
room_dict["images"] = []
# Add room type info
if room.room_type:
room_dict["room_type"] = {
"id": room.room_type.id,
"name": room.room_type.name,
"description": room.room_type.description,
"base_price": float(room.room_type.base_price) if room.room_type.base_price else 0.0,
"capacity": room.room_type.capacity,
"amenities": room.room_type.amenities if room.room_type.amenities else [],
"images": []
}
return {
"status": "success",
"message": "Room created successfully",
"data": {"room": room}
"data": {"room": room_dict}
}
except HTTPException:
raise
@@ -310,6 +434,7 @@ async def create_room(
async def update_room(
id: int,
room_data: dict,
request: Request,
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
@@ -337,14 +462,70 @@ async def update_room(
room.featured = room_data["featured"]
if "price" in room_data:
room.price = room_data["price"]
if "description" in room_data:
room.description = room_data["description"]
if "capacity" in room_data:
room.capacity = room_data["capacity"]
if "room_size" in room_data:
room.room_size = room_data["room_size"]
if "view" in room_data:
room.view = room_data["view"]
if "amenities" in room_data:
# Ensure amenities is always a list
amenities_value = room_data["amenities"]
if amenities_value is None:
room.amenities = []
elif isinstance(amenities_value, list):
room.amenities = amenities_value
else:
room.amenities = []
db.commit()
db.refresh(room)
# Get base URL for proper response
base_url = get_base_url(request)
# Serialize room data similar to get_room_by_id
room_dict = {
"id": room.id,
"room_type_id": room.room_type_id,
"room_number": room.room_number,
"floor": room.floor,
"status": room.status.value if isinstance(room.status, RoomStatus) else room.status,
"price": float(room.price) if room.price is not None and room.price > 0 else None,
"featured": room.featured,
"description": room.description,
"capacity": room.capacity,
"room_size": room.room_size,
"view": room.view,
"amenities": room.amenities if room.amenities else [],
"created_at": room.created_at.isoformat() if room.created_at else None,
"updated_at": room.updated_at.isoformat() if room.updated_at else None,
}
# Normalize images
try:
room_dict["images"] = normalize_images(room.images, base_url)
except Exception:
room_dict["images"] = []
# Add room type info
if room.room_type:
room_dict["room_type"] = {
"id": room.room_type.id,
"name": room.room_type.name,
"description": room.room_type.description,
"base_price": float(room.room_type.base_price) if room.room_type.base_price else 0.0,
"capacity": room.room_type.capacity,
"amenities": room.room_type.amenities if room.room_type.amenities else [],
"images": []
}
return {
"status": "success",
"message": "Room updated successfully",
"data": {"room": room}
"data": {"room": room_dict}
}
except HTTPException:
raise
@@ -379,6 +560,57 @@ async def delete_room(
raise HTTPException(status_code=500, detail=str(e))
@router.post("/bulk-delete", dependencies=[Depends(authorize_roles("admin"))])
async def bulk_delete_rooms(
room_ids: dict,
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
"""Bulk delete rooms (Admin only)"""
try:
ids = room_ids.get("ids", [])
if not ids or not isinstance(ids, list):
raise HTTPException(status_code=400, detail="Invalid room IDs provided")
if len(ids) == 0:
raise HTTPException(status_code=400, detail="No room IDs provided")
# Validate all IDs are integers
try:
ids = [int(id) for id in ids]
except (ValueError, TypeError):
raise HTTPException(status_code=400, detail="All room IDs must be integers")
# Check if all rooms exist
rooms = db.query(Room).filter(Room.id.in_(ids)).all()
found_ids = [room.id for room in rooms]
not_found_ids = [id for id in ids if id not in found_ids]
if not_found_ids:
raise HTTPException(
status_code=404,
detail=f"Rooms with IDs {not_found_ids} not found"
)
# Delete all rooms
deleted_count = db.query(Room).filter(Room.id.in_(ids)).delete(synchronize_session=False)
db.commit()
return {
"status": "success",
"message": f"Successfully deleted {deleted_count} room(s)",
"data": {
"deleted_count": deleted_count,
"deleted_ids": ids
}
}
except HTTPException:
raise
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
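A sketch of the expected request body for the new bulk endpoint (URL and token are placeholders):

import httpx

resp = httpx.post(
    "http://localhost:8000/api/rooms/bulk-delete",
    headers={"Authorization": "Bearer <token>"},
    json={"ids": [3, 5, 9]},
)
print(resp.json()["data"]["deleted_count"])  # -> 3 on success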
@router.post("/{id}/images", dependencies=[Depends(authorize_roles("admin", "staff"))])
async def upload_room_images(
id: int,
@@ -435,7 +667,7 @@ async def upload_room_images(
@router.delete("/{id}/images", dependencies=[Depends(authorize_roles("admin", "staff"))])
async def delete_room_images(
id: int,
image_url: str,
image_url: str = Query(..., description="Image URL or path to delete"),
current_user: User = Depends(authorize_roles("admin", "staff")),
db: Session = Depends(get_db)
):
@@ -445,12 +677,39 @@ async def delete_room_images(
if not room:
raise HTTPException(status_code=404, detail="Room not found")
# Update room images (images are stored on Room, not RoomType)
# Normalize the incoming image_url to extract the path
# Handle both full URLs and relative paths
normalized_url = image_url
if image_url.startswith('http://') or image_url.startswith('https://'):
# Extract path from URL
from urllib.parse import urlparse
parsed = urlparse(image_url)
normalized_url = parsed.path
# Normalize paths for comparison (ensure leading slash)
if not normalized_url.startswith('/'):
normalized_url = f"/{normalized_url}"
# Get filename from normalized path
filename = Path(normalized_url).name
# Update room images - compare by filename or full path
existing_images = room.images or []
updated_images = [img for img in existing_images if img != image_url]
updated_images = []
for img in existing_images:
# Normalize stored image path
stored_path = img if img.startswith('/') else f"/{img}"
stored_filename = Path(stored_path).name
# Compare by filename or full path
# Keep images that don't match
if (img != normalized_url and
stored_path != normalized_url and
stored_filename != filename):
updated_images.append(img)
# Delete the file from disk (filename was already derived from the normalized path above)
file_path = Path(__file__).parent.parent.parent / "uploads" / "rooms" / filename
if file_path.exists():
file_path.unlink()

View File

@@ -0,0 +1,302 @@
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.orm import Session
from typing import Optional
from ..config.database import get_db
from ..middleware.auth import get_current_user, authorize_roles
from ..models.user import User
from ..models.system_settings import SystemSettings
router = APIRouter(prefix="/admin/system-settings", tags=["admin-system-settings"])
@router.get("/currency")
async def get_platform_currency(
db: Session = Depends(get_db)
):
"""Get platform currency setting (public endpoint for frontend)"""
try:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "platform_currency"
).first()
if not setting:
# Default to VND if not set
return {
"status": "success",
"data": {
"currency": "VND",
"updated_at": None,
"updated_by": None
}
}
return {
"status": "success",
"data": {
"currency": setting.value,
"updated_at": setting.updated_at.isoformat() if setting.updated_at else None,
"updated_by": setting.updated_by.full_name if setting.updated_by else None
}
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.put("/currency")
async def update_platform_currency(
currency_data: dict,
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
"""Update platform currency (Admin only)"""
try:
currency = currency_data.get("currency", "").upper()
# Validate currency code
if not currency or len(currency) != 3 or not currency.isalpha():
raise HTTPException(
status_code=400,
detail="Invalid currency code. Must be a 3-letter ISO 4217 code (e.g., USD, EUR, VND)"
)
# Get or create setting
setting = db.query(SystemSettings).filter(
SystemSettings.key == "platform_currency"
).first()
if setting:
setting.value = currency
setting.updated_by_id = current_user.id
else:
setting = SystemSettings(
key="platform_currency",
value=currency,
description="Platform-wide currency setting for displaying prices across the application",
updated_by_id=current_user.id
)
db.add(setting)
db.commit()
db.refresh(setting)
return {
"status": "success",
"message": "Platform currency updated successfully",
"data": {
"currency": setting.value,
"updated_at": setting.updated_at.isoformat() if setting.updated_at else None,
"updated_by": setting.updated_by.full_name if setting.updated_by else None
}
}
except HTTPException:
raise
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
@router.get("/")
async def get_all_settings(
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
"""Get all system settings (Admin only)"""
try:
settings = db.query(SystemSettings).all()
result = []
for setting in settings:
result.append({
"key": setting.key,
"value": setting.value,
"description": setting.description,
"updated_at": setting.updated_at.isoformat() if setting.updated_at else None,
"updated_by": setting.updated_by.full_name if setting.updated_by else None
})
return {
"status": "success",
"data": {
"settings": result
}
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.get("/stripe")
async def get_stripe_settings(
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
"""Get Stripe payment settings (Admin only)"""
try:
secret_key_setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_secret_key"
).first()
publishable_key_setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_publishable_key"
).first()
webhook_secret_setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_webhook_secret"
).first()
# Build masked display variants (last 4 characters only); note that the raw values are still returned below for the admin UI
def mask_key(key_value: str) -> str:
if not key_value or len(key_value) < 4:
return ""
return "*" * (len(key_value) - 4) + key_value[-4:]
result = {
"stripe_secret_key": "",
"stripe_publishable_key": "",
"stripe_webhook_secret": "",
"stripe_secret_key_masked": "",
"stripe_webhook_secret_masked": "",
"has_secret_key": False,
"has_publishable_key": False,
"has_webhook_secret": False,
}
if secret_key_setting:
result["stripe_secret_key"] = secret_key_setting.value
result["stripe_secret_key_masked"] = mask_key(secret_key_setting.value)
result["has_secret_key"] = bool(secret_key_setting.value)
result["updated_at"] = secret_key_setting.updated_at.isoformat() if secret_key_setting.updated_at else None
result["updated_by"] = secret_key_setting.updated_by.full_name if secret_key_setting.updated_by else None
if publishable_key_setting:
result["stripe_publishable_key"] = publishable_key_setting.value
result["has_publishable_key"] = bool(publishable_key_setting.value)
if webhook_secret_setting:
result["stripe_webhook_secret"] = webhook_secret_setting.value
result["stripe_webhook_secret_masked"] = mask_key(webhook_secret_setting.value)
result["has_webhook_secret"] = bool(webhook_secret_setting.value)
return {
"status": "success",
"data": result
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.put("/stripe")
async def update_stripe_settings(
stripe_data: dict,
current_user: User = Depends(authorize_roles("admin")),
db: Session = Depends(get_db)
):
"""Update Stripe payment settings (Admin only)"""
try:
secret_key = stripe_data.get("stripe_secret_key", "").strip()
publishable_key = stripe_data.get("stripe_publishable_key", "").strip()
webhook_secret = stripe_data.get("stripe_webhook_secret", "").strip()
# Validate secret key format (should start with sk_)
if secret_key and not secret_key.startswith("sk_"):
raise HTTPException(
status_code=400,
detail="Invalid Stripe secret key format. Must start with 'sk_'"
)
# Validate publishable key format (should start with pk_)
if publishable_key and not publishable_key.startswith("pk_"):
raise HTTPException(
status_code=400,
detail="Invalid Stripe publishable key format. Must start with 'pk_'"
)
# Validate webhook secret format (should start with whsec_)
if webhook_secret and not webhook_secret.startswith("whsec_"):
raise HTTPException(
status_code=400,
detail="Invalid Stripe webhook secret format. Must start with 'whsec_'"
)
# Update or create secret key setting
if secret_key:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_secret_key"
).first()
if setting:
setting.value = secret_key
setting.updated_by_id = current_user.id
else:
setting = SystemSettings(
key="stripe_secret_key",
value=secret_key,
description="Stripe secret key for processing payments",
updated_by_id=current_user.id
)
db.add(setting)
# Update or create publishable key setting
if publishable_key:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_publishable_key"
).first()
if setting:
setting.value = publishable_key
setting.updated_by_id = current_user.id
else:
setting = SystemSettings(
key="stripe_publishable_key",
value=publishable_key,
description="Stripe publishable key for frontend payment forms",
updated_by_id=current_user.id
)
db.add(setting)
# Update or create webhook secret setting
if webhook_secret:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_webhook_secret"
).first()
if setting:
setting.value = webhook_secret
setting.updated_by_id = current_user.id
else:
setting = SystemSettings(
key="stripe_webhook_secret",
value=webhook_secret,
description="Stripe webhook secret for verifying webhook events",
updated_by_id=current_user.id
)
db.add(setting)
db.commit()
# Return masked values
def mask_key(key_value: str) -> str:
if not key_value or len(key_value) < 4:
return ""
return "*" * (len(key_value) - 4) + key_value[-4:]
return {
"status": "success",
"message": "Stripe settings updated successfully",
"data": {
"stripe_secret_key": secret_key if secret_key else "",
"stripe_publishable_key": publishable_key,
"stripe_webhook_secret": webhook_secret if webhook_secret else "",
"stripe_secret_key_masked": mask_key(secret_key) if secret_key else "",
"stripe_webhook_secret_masked": mask_key(webhook_secret) if webhook_secret else "",
"has_secret_key": bool(secret_key),
"has_publishable_key": bool(publishable_key),
"has_webhook_secret": bool(webhook_secret),
}
}
except HTTPException:
raise
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=str(e))
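Same idea for the Stripe settings: keys are validated by prefix (`sk_`, `pk_`, `whsec_`) and echoed back alongside masked variants. A hedged sketch reusing the TestClient from the earlier example:

```python
resp = client.put(
    "/admin/system-settings/stripe",
    json={
        "stripe_secret_key": "sk_test_123abc",
        "stripe_publishable_key": "pk_test_123abc",
        "stripe_webhook_secret": "whsec_123abc",
    },
    headers={"Authorization": "Bearer <admin-token>"},
)
data = resp.json()["data"]
print(data["stripe_secret_key_masked"])  # "**********3abc" (all but the last 4 masked)
print(data["has_webhook_secret"])        # True

# A key with the wrong prefix is rejected with a 400:
resp = client.put(
    "/admin/system-settings/stripe",
    json={"stripe_secret_key": "pk_wrong_prefix"},
    headers={"Authorization": "Bearer <admin-token>"},
)
assert resp.status_code == 400
```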

View File

@@ -66,6 +66,7 @@ async def get_users(
"phone_number": user.phone, # For frontend compatibility
"address": user.address,
"avatar": user.avatar,
"currency": getattr(user, 'currency', 'VND'),
"is_active": user.is_active,
"status": "active" if user.is_active else "inactive",
"role_id": user.role_id,
@@ -117,6 +118,7 @@ async def get_user_by_id(
"phone_number": user.phone,
"address": user.address,
"avatar": user.avatar,
"currency": getattr(user, 'currency', 'VND'),
"is_active": user.is_active,
"status": "active" if user.is_active else "inactive",
"role_id": user.role_id,
@@ -194,6 +196,7 @@ async def create_user(
"full_name": user.full_name,
"phone": user.phone,
"phone_number": user.phone,
"currency": getattr(user, 'currency', 'VND'),
"role_id": user.role_id,
"is_active": user.is_active,
}
@@ -248,6 +251,10 @@ async def update_user(
user.role_id = role_map.get(user_data["role"], 3)
if "status" in user_data and current_user.role_id == 1:
user.is_active = user_data["status"] == "active"
if "currency" in user_data:
currency = user_data["currency"]
if len(currency) == 3 and currency.isalpha():
user.currency = currency.upper()
if "password" in user_data:
password_bytes = user_data["password"].encode('utf-8')
salt = bcrypt.gensalt()
@@ -263,6 +270,7 @@ async def update_user(
"full_name": user.full_name,
"phone": user.phone,
"phone_number": user.phone,
"currency": getattr(user, 'currency', 'VND'),
"role_id": user.role_id,
"is_active": user.is_active,
}

View File

@@ -81,6 +81,7 @@ class AuthService:
"email": user.email,
"phone": user.phone,
"avatar": user.avatar,
"currency": getattr(user, 'currency', 'VND'),
"role": user.role.name if user.role else "customer",
"createdAt": user.created_at.isoformat() if user.created_at else None,
"updatedAt": user.updated_at.isoformat() if user.updated_at else None,
@@ -265,7 +266,8 @@ class AuthService:
email: Optional[str] = None,
phone_number: Optional[str] = None,
password: Optional[str] = None,
current_password: Optional[str] = None,
currency: Optional[str] = None
) -> dict:
"""Update user profile"""
user = db.query(User).filter(User.id == user_id).first()
@@ -295,6 +297,12 @@ class AuthService:
user.email = email
if phone_number is not None:
user.phone = phone_number
if currency is not None:
# Validate currency code (ISO 4217, 3 characters)
if len(currency) == 3 and currency.isalpha():
user.currency = currency.upper()
else:
raise ValueError("Invalid currency code. Must be a 3-letter ISO 4217 code (e.g., USD, EUR, VND)")
db.commit()
db.refresh(user)

View File

@@ -0,0 +1,101 @@
"""
Currency conversion service
Handles currency conversion between different currencies
"""
from typing import Dict
from decimal import Decimal
# Base currency is VND (Vietnamese Dong)
# Exchange rates relative to VND (1 VND = base)
# These are approximate rates - in production, fetch from an API like exchangerate-api.com
EXCHANGE_RATES: Dict[str, Decimal] = {
'VND': Decimal('1.0'), # Base currency
'USD': Decimal('0.000041'), # 1 VND = 0.000041 USD (approx 24,000 VND = 1 USD)
'EUR': Decimal('0.000038'), # 1 VND = 0.000038 EUR (approx 26,000 VND = 1 EUR)
'GBP': Decimal('0.000033'), # 1 VND = 0.000033 GBP (approx 30,000 VND = 1 GBP)
'JPY': Decimal('0.0061'), # 1 VND = 0.0061 JPY (approx 164 VND = 1 JPY)
'CNY': Decimal('0.00029'), # 1 VND = 0.00029 CNY (approx 3,400 VND = 1 CNY)
'KRW': Decimal('0.055'), # 1 VND = 0.055 KRW (approx 18 VND = 1 KRW)
'SGD': Decimal('0.000055'), # 1 VND = 0.000055 SGD (approx 18,000 VND = 1 SGD)
'THB': Decimal('0.0015'), # 1 VND = 0.0015 THB (approx 667 VND = 1 THB)
'AUD': Decimal('0.000062'), # 1 VND = 0.000062 AUD (approx 16,000 VND = 1 AUD)
'CAD': Decimal('0.000056'), # 1 VND = 0.000056 CAD (approx 18,000 VND = 1 CAD)
}
# Supported currencies list
SUPPORTED_CURRENCIES = list(EXCHANGE_RATES.keys())
class CurrencyService:
"""Service for currency conversion"""
@staticmethod
def get_supported_currencies() -> list:
"""Get list of supported currency codes"""
return SUPPORTED_CURRENCIES
@staticmethod
def convert_amount(amount: float, from_currency: str, to_currency: str) -> float:
"""
Convert amount from one currency to another
Args:
amount: Amount to convert
from_currency: Source currency code (ISO 4217)
to_currency: Target currency code (ISO 4217)
Returns:
Converted amount
"""
from_currency = from_currency.upper()
to_currency = to_currency.upper()
if from_currency == to_currency:
return amount
if from_currency not in EXCHANGE_RATES:
raise ValueError(f"Unsupported source currency: {from_currency}")
if to_currency not in EXCHANGE_RATES:
raise ValueError(f"Unsupported target currency: {to_currency}")
# Convert to VND first, then to target currency
amount_vnd = Decimal(str(amount)) / EXCHANGE_RATES[from_currency]
converted_amount = amount_vnd * EXCHANGE_RATES[to_currency]
return float(converted_amount)
@staticmethod
def get_exchange_rate(from_currency: str, to_currency: str) -> float:
"""
Get exchange rate between two currencies
Args:
from_currency: Source currency code
to_currency: Target currency code
Returns:
Exchange rate (1 from_currency = X to_currency)
"""
from_currency = from_currency.upper()
to_currency = to_currency.upper()
if from_currency == to_currency:
return 1.0
if from_currency not in EXCHANGE_RATES:
raise ValueError(f"Unsupported source currency: {from_currency}")
if to_currency not in EXCHANGE_RATES:
raise ValueError(f"Unsupported target currency: {to_currency}")
# Rate = (1 / from_rate) * to_rate
rate = EXCHANGE_RATES[to_currency] / EXCHANGE_RATES[from_currency]
return float(rate)
@staticmethod
def format_currency_code(currency: str) -> str:
"""Format currency code to uppercase"""
return currency.upper() if currency else 'VND'
currency_service = CurrencyService()
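A quick sanity check of the conversion math against the hard-coded rates above (illustrative values only; production rates would come from an API):

```python
vnd = currency_service.convert_amount(100, "USD", "VND")
# 100 / 0.000041 -> roughly 2,439,024 VND

rate = currency_service.get_exchange_rate("USD", "VND")
# 1 / 0.000041 -> roughly 24,390 VND per USD

currency_service.convert_amount(100, "USD", "XYZ")
# -> ValueError: Unsupported target currency: XYZ
```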

View File

@@ -0,0 +1,388 @@
"""
Invoice service for managing invoices
"""
from sqlalchemy.orm import Session
from sqlalchemy import func, and_, or_
from typing import Optional, Dict, Any, List
from datetime import datetime, timedelta
from ..models.invoice import Invoice, InvoiceItem, InvoiceStatus
from ..models.booking import Booking
from ..models.payment import Payment, PaymentStatus
from ..models.user import User
def generate_invoice_number(db: Session) -> str:
"""Generate a unique invoice number"""
# Format: INV-YYYYMMDD-XXXX
today = datetime.utcnow().strftime("%Y%m%d")
# Get the last invoice number for today
last_invoice = db.query(Invoice).filter(
Invoice.invoice_number.like(f"INV-{today}-%")
).order_by(Invoice.invoice_number.desc()).first()
if last_invoice:
# Extract the sequence number and increment
try:
sequence = int(last_invoice.invoice_number.split("-")[-1])
sequence += 1
except (ValueError, IndexError):
sequence = 1
else:
sequence = 1
return f"INV-{today}-{sequence:04d}"
class InvoiceService:
"""Service for managing invoices"""
@staticmethod
def create_invoice_from_booking(
booking_id: int,
db: Session,
created_by_id: Optional[int] = None,
tax_rate: float = 0.0,
discount_amount: float = 0.0,
due_days: int = 30,
**kwargs
) -> Dict[str, Any]:
"""
Create an invoice from a booking
Args:
booking_id: Booking ID
db: Database session
created_by_id: User ID who created the invoice
tax_rate: Tax rate percentage (default: 0.0)
discount_amount: Discount amount (default: 0.0)
due_days: Number of days until due date (default: 30)
**kwargs: Additional invoice fields (company info, notes, etc.)
Returns:
Invoice dictionary
"""
booking = db.query(Booking).filter(Booking.id == booking_id).first()
if not booking:
raise ValueError("Booking not found")
user = db.query(User).filter(User.id == booking.user_id).first()
if not user:
raise ValueError("User not found")
# Generate invoice number
invoice_number = generate_invoice_number(db)
# Calculate amounts
subtotal = float(booking.total_price)
tax_amount = (subtotal - discount_amount) * (tax_rate / 100)
total_amount = subtotal + tax_amount - discount_amount
# Calculate amount paid from completed payments
amount_paid = sum(
float(p.amount) for p in booking.payments
if p.payment_status == PaymentStatus.completed
)
balance_due = total_amount - amount_paid
# Determine status
if balance_due <= 0:
status = InvoiceStatus.paid
paid_date = datetime.utcnow()
elif amount_paid > 0:
status = InvoiceStatus.sent
paid_date = None
else:
status = InvoiceStatus.draft
paid_date = None
# Create invoice
invoice = Invoice(
invoice_number=invoice_number,
booking_id=booking_id,
user_id=booking.user_id,
issue_date=datetime.utcnow(),
due_date=datetime.utcnow() + timedelta(days=due_days),
paid_date=paid_date,
subtotal=subtotal,
tax_rate=tax_rate,
tax_amount=tax_amount,
discount_amount=discount_amount,
total_amount=total_amount,
amount_paid=amount_paid,
balance_due=balance_due,
status=status,
company_name=kwargs.get("company_name"),
company_address=kwargs.get("company_address"),
company_phone=kwargs.get("company_phone"),
company_email=kwargs.get("company_email"),
company_tax_id=kwargs.get("company_tax_id"),
company_logo_url=kwargs.get("company_logo_url"),
customer_name=user.full_name or user.email,
customer_email=user.email,
customer_address=user.address,
customer_phone=user.phone,
customer_tax_id=kwargs.get("customer_tax_id"),
notes=kwargs.get("notes"),
terms_and_conditions=kwargs.get("terms_and_conditions"),
payment_instructions=kwargs.get("payment_instructions"),
created_by_id=created_by_id,
)
db.add(invoice)
db.flush()  # assign invoice.id so the line items below reference a real id
# Create invoice items from booking
# Room item
room_item = InvoiceItem(
invoice_id=invoice.id,
description=f"Room: {booking.room.room_number} - {booking.room.room_type.name if booking.room.room_type else 'N/A'}",
quantity=1,
unit_price=float(booking.total_price),
tax_rate=tax_rate,
discount_amount=0.0,
line_total=float(booking.total_price),
room_id=booking.room_id,
)
db.add(room_item)
# Add service items if any
for service_usage in booking.service_usages:
service_item = InvoiceItem(
invoice_id=invoice.id,
description=f"Service: {service_usage.service.name}",
quantity=float(service_usage.quantity),
unit_price=float(service_usage.service.price),
tax_rate=tax_rate,
discount_amount=0.0,
line_total=float(service_usage.quantity) * float(service_usage.service.price),
service_id=service_usage.service_id,
)
db.add(service_item)
subtotal += float(service_usage.quantity) * float(service_usage.service.price)
# Recalculate totals if services were added
if booking.service_usages:
tax_amount = (subtotal - discount_amount) * (tax_rate / 100)
total_amount = subtotal + tax_amount - discount_amount
balance_due = total_amount - amount_paid
invoice.subtotal = subtotal
invoice.tax_amount = tax_amount
invoice.total_amount = total_amount
invoice.balance_due = balance_due
db.commit()
db.refresh(invoice)
return InvoiceService.invoice_to_dict(invoice)
@staticmethod
def update_invoice(
invoice_id: int,
db: Session,
updated_by_id: Optional[int] = None,
**kwargs
) -> Dict[str, Any]:
"""
Update an invoice
Args:
invoice_id: Invoice ID
db: Database session
updated_by_id: User ID who updated the invoice
**kwargs: Fields to update
Returns:
Updated invoice dictionary
"""
invoice = db.query(Invoice).filter(Invoice.id == invoice_id).first()
if not invoice:
raise ValueError("Invoice not found")
# Update allowed fields
allowed_fields = [
"company_name", "company_address", "company_phone", "company_email",
"company_tax_id", "company_logo_url", "notes", "terms_and_conditions",
"payment_instructions", "status", "due_date", "tax_rate", "discount_amount"
]
for field in allowed_fields:
if field in kwargs:
setattr(invoice, field, kwargs[field])
# Recalculate if tax_rate or discount_amount changed
if "tax_rate" in kwargs or "discount_amount" in kwargs:
tax_rate = kwargs.get("tax_rate", invoice.tax_rate)
discount_amount = kwargs.get("discount_amount", invoice.discount_amount)
invoice.tax_amount = (invoice.subtotal - discount_amount) * (float(tax_rate) / 100)
invoice.total_amount = invoice.subtotal + invoice.tax_amount - discount_amount
invoice.balance_due = invoice.total_amount - invoice.amount_paid
# Update status based on balance
if invoice.balance_due <= 0 and invoice.status != InvoiceStatus.paid:
invoice.status = InvoiceStatus.paid
invoice.paid_date = datetime.utcnow()
elif invoice.balance_due > 0 and invoice.status == InvoiceStatus.paid:
invoice.status = InvoiceStatus.sent
invoice.paid_date = None
invoice.updated_by_id = updated_by_id
invoice.updated_at = datetime.utcnow()
db.commit()
db.refresh(invoice)
return InvoiceService.invoice_to_dict(invoice)
@staticmethod
def mark_invoice_as_paid(
invoice_id: int,
db: Session,
amount: Optional[float] = None,
updated_by_id: Optional[int] = None
) -> Dict[str, Any]:
"""
Mark an invoice as paid
Args:
invoice_id: Invoice ID
db: Database session
amount: Payment amount (if None, uses balance_due)
updated_by_id: User ID who marked as paid
Returns:
Updated invoice dictionary
"""
invoice = db.query(Invoice).filter(Invoice.id == invoice_id).first()
if not invoice:
raise ValueError("Invoice not found")
payment_amount = amount if amount is not None else float(invoice.balance_due)
# Cast to float to avoid mixing Decimal column values with float amounts
invoice.amount_paid = float(invoice.amount_paid or 0) + payment_amount
invoice.balance_due = float(invoice.total_amount) - float(invoice.amount_paid)
if invoice.balance_due <= 0:
invoice.status = InvoiceStatus.paid
invoice.paid_date = datetime.utcnow()
else:
invoice.status = InvoiceStatus.sent
invoice.updated_by_id = updated_by_id
invoice.updated_at = datetime.utcnow()
db.commit()
db.refresh(invoice)
return InvoiceService.invoice_to_dict(invoice)
@staticmethod
def get_invoice(invoice_id: int, db: Session) -> Optional[Dict[str, Any]]:
"""Get invoice by ID"""
invoice = db.query(Invoice).filter(Invoice.id == invoice_id).first()
if not invoice:
return None
return InvoiceService.invoice_to_dict(invoice)
@staticmethod
def get_invoices(
db: Session,
user_id: Optional[int] = None,
booking_id: Optional[int] = None,
status: Optional[str] = None,
page: int = 1,
limit: int = 10
) -> Dict[str, Any]:
"""
Get invoices with filters
Args:
db: Database session
user_id: Filter by user ID
booking_id: Filter by booking ID
status: Filter by status
page: Page number
limit: Items per page
Returns:
Dictionary with invoices and pagination info
"""
query = db.query(Invoice)
if user_id:
query = query.filter(Invoice.user_id == user_id)
if booking_id:
query = query.filter(Invoice.booking_id == booking_id)
if status:
try:
status_enum = InvoiceStatus(status)
query = query.filter(Invoice.status == status_enum)
except ValueError:
pass
# Get total count
total = query.count()
# Apply pagination
offset = (page - 1) * limit
invoices = query.order_by(Invoice.created_at.desc()).offset(offset).limit(limit).all()
return {
"invoices": [InvoiceService.invoice_to_dict(inv) for inv in invoices],
"total": total,
"page": page,
"limit": limit,
"total_pages": (total + limit - 1) // limit
}
@staticmethod
def invoice_to_dict(invoice: Invoice) -> Dict[str, Any]:
"""Convert invoice model to dictionary"""
return {
"id": invoice.id,
"invoice_number": invoice.invoice_number,
"booking_id": invoice.booking_id,
"user_id": invoice.user_id,
"issue_date": invoice.issue_date.isoformat() if invoice.issue_date else None,
"due_date": invoice.due_date.isoformat() if invoice.due_date else None,
"paid_date": invoice.paid_date.isoformat() if invoice.paid_date else None,
"subtotal": float(invoice.subtotal) if invoice.subtotal else 0.0,
"tax_rate": float(invoice.tax_rate) if invoice.tax_rate else 0.0,
"tax_amount": float(invoice.tax_amount) if invoice.tax_amount else 0.0,
"discount_amount": float(invoice.discount_amount) if invoice.discount_amount else 0.0,
"total_amount": float(invoice.total_amount) if invoice.total_amount else 0.0,
"amount_paid": float(invoice.amount_paid) if invoice.amount_paid else 0.0,
"balance_due": float(invoice.balance_due) if invoice.balance_due else 0.0,
"status": invoice.status.value if invoice.status else None,
"company_name": invoice.company_name,
"company_address": invoice.company_address,
"company_phone": invoice.company_phone,
"company_email": invoice.company_email,
"company_tax_id": invoice.company_tax_id,
"company_logo_url": invoice.company_logo_url,
"customer_name": invoice.customer_name,
"customer_email": invoice.customer_email,
"customer_address": invoice.customer_address,
"customer_phone": invoice.customer_phone,
"customer_tax_id": invoice.customer_tax_id,
"notes": invoice.notes,
"terms_and_conditions": invoice.terms_and_conditions,
"payment_instructions": invoice.payment_instructions,
"items": [
{
"id": item.id,
"description": item.description,
"quantity": float(item.quantity) if item.quantity else 0.0,
"unit_price": float(item.unit_price) if item.unit_price else 0.0,
"tax_rate": float(item.tax_rate) if item.tax_rate else 0.0,
"discount_amount": float(item.discount_amount) if item.discount_amount else 0.0,
"line_total": float(item.line_total) if item.line_total else 0.0,
"room_id": item.room_id,
"service_id": item.service_id,
}
for item in invoice.items
],
"created_at": invoice.created_at.isoformat() if invoice.created_at else None,
"updated_at": invoice.updated_at.isoformat() if invoice.updated_at else None,
}
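A sketch of the typical flow, assuming `db` is an open SQLAlchemy session and booking 42 exists (both assumptions):

```python
invoice = InvoiceService.create_invoice_from_booking(
    booking_id=42,          # hypothetical booking
    db=db,
    created_by_id=1,
    tax_rate=10.0,          # percent
    discount_amount=50.0,
    notes="Thank you for staying with us",
)
print(invoice["invoice_number"], invoice["status"], invoice["balance_due"])

# Record a later partial payment against the same invoice:
invoice = InvoiceService.mark_invoice_as_paid(invoice["id"], db, amount=100.0)
```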

View File

@@ -40,7 +40,20 @@ def normalize_images(images, base_url: str) -> List[str]:
def get_base_url(request) -> str:
"""Get base URL for image normalization"""
return os.getenv("SERVER_URL") or f"http://{request.headers.get('host', 'localhost:3000')}"
# Try to get from environment first
server_url = os.getenv("SERVER_URL")
if server_url:
return server_url.rstrip('/')
# Get from request host header
host = request.headers.get('host', 'localhost:8000')
# Ensure we use the backend port if host doesn't have a port
if ':' not in host:
host = f"{host}:8000"
# Use http or https based on scheme
scheme = request.url.scheme if hasattr(request.url, 'scheme') else 'http'
return f"{scheme}://{host}"
async def get_rooms_with_ratings(
@@ -72,6 +85,9 @@ async def get_rooms_with_ratings(
"price": float(room.price) if room.price else 0.0,
"featured": room.featured,
"description": room.description,
"capacity": room.capacity,
"room_size": room.room_size,
"view": room.view,
"amenities": room.amenities,
"created_at": room.created_at.isoformat() if room.created_at else None,
"updated_at": room.updated_at.isoformat() if room.updated_at else None,
@@ -102,44 +118,319 @@ async def get_rooms_with_ratings(
return result
def get_predefined_amenities() -> List[str]:
"""Get comprehensive list of predefined hotel room amenities"""
return [
# Basic Amenities
"Free WiFi",
"WiFi",
"High-Speed Internet",
"WiFi in Room",
# Entertainment
"Flat-Screen TV",
"TV",
"Cable TV",
"Satellite TV",
"Smart TV",
"Netflix",
"Streaming Services",
"DVD Player",
"Stereo System",
"Radio",
"iPod Dock",
# Climate Control
"Air Conditioning",
"AC",
"Heating",
"Climate Control",
"Ceiling Fan",
"Air Purifier",
# Bathroom Features
"Private Bathroom",
"Ensuite Bathroom",
"Bathtub",
"Jacuzzi Bathtub",
"Hot Tub",
"Shower",
"Rain Shower",
"Walk-in Shower",
"Bidet",
"Hair Dryer",
"Hairdryer",
"Bathrobes",
"Slippers",
"Toiletries",
"Premium Toiletries",
"Towels",
# Food & Beverage
"Mini Bar",
"Minibar",
"Refrigerator",
"Fridge",
"Microwave",
"Coffee Maker",
"Electric Kettle",
"Tea Making Facilities",
"Coffee Machine",
"Nespresso Machine",
"Kitchenette",
"Dining Table",
"Room Service",
"Breakfast Included",
"Breakfast",
"Complimentary Water",
"Bottled Water",
# Furniture & Space
"Desk",
"Writing Desk",
"Office Desk",
"Work Desk",
"Sofa",
"Sitting Area",
"Lounge Area",
"Dining Area",
"Separate Living Area",
"Wardrobe",
"Closet",
"Dresser",
"Mirror",
"Full-Length Mirror",
"Seating Area",
# Bed & Sleep
"King Size Bed",
"Queen Size Bed",
"Double Bed",
"Twin Beds",
"Single Bed",
"Extra Bedding",
"Pillow Menu",
"Premium Bedding",
"Blackout Curtains",
"Soundproofing",
# Safety & Security
"Safe",
"In-Room Safe",
"Safety Deposit Box",
"Smoke Detector",
"Fire Extinguisher",
"Security System",
"Key Card Access",
"Door Lock",
"Pepper Spray",
# Technology
"USB Charging Ports",
"USB Ports",
"USB Outlets",
"Power Outlets",
"Charging Station",
"Laptop Safe",
"HDMI Port",
"Phone",
"Desk Phone",
"Wake-Up Service",
"Alarm Clock",
"Digital Clock",
# View & Outdoor
"Balcony",
"Private Balcony",
"Terrace",
"Patio",
"City View",
"Ocean View",
"Sea View",
"Mountain View",
"Garden View",
"Pool View",
"Park View",
"Window",
"Large Windows",
"Floor-to-Ceiling Windows",
# Services
"24-Hour Front Desk",
"24 Hour Front Desk",
"24/7 Front Desk",
"Concierge Service",
"Butler Service",
"Housekeeping",
"Daily Housekeeping",
"Turndown Service",
"Laundry Service",
"Dry Cleaning",
"Ironing Service",
"Luggage Storage",
"Bell Service",
"Valet Parking",
"Parking",
"Free Parking",
"Airport Shuttle",
"Shuttle Service",
"Car Rental",
"Taxi Service",
# Fitness & Wellness
"Gym Access",
"Fitness Center",
"Fitness Room",
"Spa Access",
"Spa",
"Sauna",
"Steam Room",
"Hot Tub",
"Massage Service",
"Beauty Services",
# Recreation
"Swimming Pool",
"Pool",
"Indoor Pool",
"Outdoor Pool",
"Infinity Pool",
"Pool Access",
"Golf Course",
"Tennis Court",
"Beach Access",
"Water Sports",
# Business & Work
"Business Center",
"Meeting Room",
"Conference Room",
"Fax Service",
"Photocopying",
"Printing Service",
"Secretarial Services",
# Accessibility
"Wheelchair Accessible",
"Accessible Room",
"Elevator Access",
"Ramp Access",
"Accessible Bathroom",
"Lowered Sink",
"Grab Bars",
"Hearing Accessible",
"Visual Alarm",
# Family & Pets
"Family Room",
"Kids Welcome",
"Baby Crib",
"Extra Bed",
"Crib",
"Childcare Services",
"Pets Allowed",
"Pet Friendly",
# Additional Features
"Smoking Room",
"Non-Smoking Room",
"No Smoking",
"Interconnecting Rooms",
"Adjoining Rooms",
"Suite",
"Separate Bedroom",
"Kitchen",
"Full Kitchen",
"Dishwasher",
"Oven",
"Stove",
"Washing Machine",
"Dryer",
"Iron",
"Ironing Board",
"Clothes Rack",
"Umbrella",
"Shoe Shine Service",
# Luxury Features
"Fireplace",
"Jacuzzi",
"Steam Shower",
"Spa Bath",
"Bidet Toilet",
"Smart Home System",
"Lighting Control",
"Curtain Control",
"Automated Systems",
"Personalized Service",
"VIP Treatment",
"Butler",
"Private Entrance",
"Private Elevator",
"Panic Button",
# Entertainment & Media
"Blu-ray Player",
"Gaming Console",
"PlayStation",
"Xbox",
"Sound System",
"Surround Sound",
"Music System",
# Special Features
"Library",
"Reading Room",
"Study Room",
"Private Pool",
"Private Garden",
"Yard",
"Courtyard",
"Outdoor Furniture",
"BBQ Facilities",
"Picnic Area",
]
async def get_amenities_list(db: Session) -> List[str]:
"""Get all unique amenities from room types and rooms, plus predefined amenities"""
# Start with the predefined comprehensive list
all_amenities = set(get_predefined_amenities())
# Get from room types
room_types = db.query(RoomType.amenities).all()
for rt in room_types:
if rt.amenities:
if isinstance(rt.amenities, list):
all_amenities.update([str(a).strip() for a in rt.amenities if str(a).strip()])
elif isinstance(rt.amenities, str):
try:
import json
parsed = json.loads(rt.amenities)
if isinstance(parsed, list):
all_amenities.update([str(a).strip() for a in parsed if str(a).strip()])
else:
all_amenities.update([s.strip() for s in rt.amenities.split(',') if s.strip()])
except Exception:
all_amenities.update([s.strip() for s in rt.amenities.split(',') if s.strip()])
# Get from rooms
rooms = db.query(Room.amenities).all()
for r in rooms:
if r.amenities:
if isinstance(r.amenities, list):
all_amenities.update([str(a).strip() for a in r.amenities if str(a).strip()])
elif isinstance(r.amenities, str):
try:
import json
parsed = json.loads(r.amenities)
if isinstance(parsed, list):
all_amenities.update([str(a).strip() for a in parsed if str(a).strip()])
else:
all_amenities.update([s.strip() for s in r.amenities.split(',') if s.strip()])
except Exception:
all_amenities.update([s.strip() for s in r.amenities.split(',') if s.strip()])
# Return unique, sorted values
return sorted(all_amenities)
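All three storage formats seen in the database normalize to the same set (illustrative values):

```python
# list column:            ["Free WiFi", " TV "]   -> {"Free WiFi", "TV"}
# JSON-encoded string:    '["Free WiFi", "TV"]'   -> {"Free WiFi", "TV"}
# comma-separated string: "Free WiFi, TV"         -> {"Free WiFi", "TV"}
```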

View File

@@ -0,0 +1,409 @@
"""
Stripe payment service for processing card payments
"""
import stripe
from typing import Optional, Dict, Any
from ..config.settings import settings
from ..models.payment import Payment, PaymentMethod, PaymentType, PaymentStatus
from ..models.booking import Booking, BookingStatus
from ..models.system_settings import SystemSettings
from sqlalchemy.orm import Session
from datetime import datetime
def get_stripe_secret_key(db: Session) -> Optional[str]:
"""Get Stripe secret key from database or environment variable"""
try:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_secret_key"
).first()
if setting and setting.value:
return setting.value
except Exception:
pass
# Fallback to environment variable
return settings.STRIPE_SECRET_KEY if settings.STRIPE_SECRET_KEY else None
def get_stripe_publishable_key(db: Session) -> Optional[str]:
"""Get Stripe publishable key from database or environment variable"""
try:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_publishable_key"
).first()
if setting and setting.value:
return setting.value
except Exception:
pass
# Fallback to environment variable
return settings.STRIPE_PUBLISHABLE_KEY if settings.STRIPE_PUBLISHABLE_KEY else None
def get_stripe_webhook_secret(db: Session) -> Optional[str]:
"""Get Stripe webhook secret from database or environment variable"""
try:
setting = db.query(SystemSettings).filter(
SystemSettings.key == "stripe_webhook_secret"
).first()
if setting and setting.value:
return setting.value
except Exception:
pass
# Fallback to environment variable
return settings.STRIPE_WEBHOOK_SECRET if settings.STRIPE_WEBHOOK_SECRET else None
class StripeService:
"""Service for handling Stripe payments"""
@staticmethod
def create_payment_intent(
amount: float,
currency: str = "usd",
metadata: Optional[Dict[str, Any]] = None,
customer_id: Optional[str] = None,
db: Optional[Session] = None
) -> Dict[str, Any]:
"""
Create a Stripe Payment Intent
Args:
amount: Payment amount in major units (e.g. dollars); converted to cents below
currency: Currency code (default: usd)
metadata: Additional metadata to attach to the payment intent
customer_id: Optional Stripe customer ID
db: Optional database session to get keys from database
Returns:
Payment intent object
"""
# Get secret key from database or environment
secret_key = None
if db:
secret_key = get_stripe_secret_key(db)
if not secret_key:
secret_key = settings.STRIPE_SECRET_KEY
if not secret_key:
raise ValueError("Stripe secret key is not configured")
# Set the API key for this request
stripe.api_key = secret_key
# Validate amount is reasonable (Stripe max is $999,999.99)
if amount <= 0:
raise ValueError("Amount must be greater than 0")
if amount > 999999.99:
raise ValueError(f"Amount ${amount:,.2f} exceeds Stripe's maximum of $999,999.99")
# Convert amount to cents (smallest currency unit)
# Amount should be in dollars, so multiply by 100 to get cents
amount_in_cents = int(round(amount * 100))
# Double-check the cents amount doesn't exceed Stripe's limit
if amount_in_cents > 99999999: # $999,999.99 in cents
raise ValueError(f"Amount ${amount:,.2f} (${amount_in_cents} cents) exceeds Stripe's maximum")
intent_params = {
"amount": amount_in_cents,
"currency": currency,
"automatic_payment_methods": {
"enabled": True,
},
"metadata": metadata or {},
}
if customer_id:
intent_params["customer"] = customer_id
try:
intent = stripe.PaymentIntent.create(**intent_params)
return {
"client_secret": intent.client_secret,
"id": intent.id,
"status": intent.status,
"amount": intent.amount,
"currency": intent.currency,
}
except stripe.StripeError as e:
raise ValueError(f"Stripe error: {str(e)}")
@staticmethod
def retrieve_payment_intent(
payment_intent_id: str,
db: Optional[Session] = None
) -> Dict[str, Any]:
"""
Retrieve a payment intent by ID
Args:
payment_intent_id: Stripe payment intent ID
db: Optional database session to get keys from database
Returns:
Payment intent object
"""
# Get secret key from database or environment
secret_key = None
if db:
secret_key = get_stripe_secret_key(db)
if not secret_key:
secret_key = settings.STRIPE_SECRET_KEY
if not secret_key:
raise ValueError("Stripe secret key is not configured")
# Set the API key for this request
stripe.api_key = secret_key
try:
intent = stripe.PaymentIntent.retrieve(payment_intent_id)
# Safely access charges - they may not exist on all payment intents
charges = []
if hasattr(intent, 'charges') and intent.charges:
charges_data = getattr(intent.charges, 'data', [])
charges = [
{
"id": charge.id,
"paid": charge.paid,
"status": charge.status,
}
for charge in charges_data
]
return {
"id": intent.id,
"status": intent.status,
"amount": intent.amount / 100, # Convert from cents
"currency": intent.currency,
"metadata": intent.metadata,
"charges": charges,
}
except stripe.StripeError as e:
raise ValueError(f"Stripe error: {str(e)}")
@staticmethod
def confirm_payment(
payment_intent_id: str,
db: Session,
booking_id: Optional[int] = None
) -> Dict[str, Any]:
"""
Confirm a payment and update database records
Args:
payment_intent_id: Stripe payment intent ID
db: Database session
booking_id: Optional booking ID for metadata lookup
Returns:
Payment record dictionary
"""
try:
intent_data = StripeService.retrieve_payment_intent(payment_intent_id, db)
# Find or get booking_id from metadata
if not booking_id and intent_data.get("metadata"):
booking_id = intent_data["metadata"].get("booking_id")
if booking_id:
booking_id = int(booking_id)
if not booking_id:
raise ValueError("Booking ID is required")
booking = db.query(Booking).filter(Booking.id == booking_id).first()
if not booking:
raise ValueError("Booking not found")
# Check payment intent status
payment_status = intent_data.get("status")
print(f"Payment intent status: {payment_status}")
# Accept succeeded or processing status (processing means payment is being processed)
if payment_status not in ["succeeded", "processing"]:
raise ValueError(f"Payment intent not in a valid state. Status: {payment_status}. Payment may still be processing or may have failed.")
# Find existing payment or create new one
payment = db.query(Payment).filter(
Payment.booking_id == booking_id,
Payment.transaction_id == payment_intent_id,
Payment.payment_method == PaymentMethod.stripe
).first()
amount = intent_data["amount"]
if payment:
# Update existing payment
# Only mark as completed if payment intent succeeded
if payment_status == "succeeded":
payment.payment_status = PaymentStatus.completed
payment.payment_date = datetime.utcnow()
# If processing, keep as pending (will be updated by webhook)
payment.amount = amount
else:
# Create new payment record
payment_type = PaymentType.full
if booking.requires_deposit and not booking.deposit_paid:
payment_type = PaymentType.deposit
# Only mark as completed if payment intent succeeded
payment_status_enum = PaymentStatus.completed if payment_status == "succeeded" else PaymentStatus.pending
payment_date = datetime.utcnow() if payment_status == "succeeded" else None
payment = Payment(
booking_id=booking_id,
amount=amount,
payment_method=PaymentMethod.stripe,
payment_type=payment_type,
payment_status=payment_status_enum,
transaction_id=payment_intent_id,
payment_date=payment_date,
notes=f"Stripe payment - Intent: {payment_intent_id} (Status: {payment_status})",
)
db.add(payment)
# Commit payment first to ensure it's saved
db.commit()
db.refresh(payment)
# Update booking status only if payment is completed
if payment.payment_status == PaymentStatus.completed:
# Refresh booking to get updated payments relationship
db.refresh(booking)
if payment.payment_type == PaymentType.deposit:
# Mark deposit as paid and confirm booking
booking.deposit_paid = True
if booking.status == BookingStatus.pending:
booking.status = BookingStatus.confirmed
elif payment.payment_type == PaymentType.full:
# Calculate total paid from all completed payments (now includes current payment)
total_paid = sum(
float(p.amount) for p in booking.payments
if p.payment_status == PaymentStatus.completed
)
# Confirm booking if:
# 1. Total paid (all payments) covers the booking price, OR
# 2. This single payment covers the entire booking amount
if total_paid >= float(booking.total_price) or float(payment.amount) >= float(booking.total_price):
booking.status = BookingStatus.confirmed
# Commit booking status update
db.commit()
db.refresh(booking)
# Safely get enum values
def get_enum_value(enum_obj):
"""Safely extract value from enum or return as-is"""
if enum_obj is None:
return None
if isinstance(enum_obj, (PaymentMethod, PaymentType, PaymentStatus)):
return enum_obj.value
return enum_obj
try:
return {
"id": payment.id,
"booking_id": payment.booking_id,
"amount": float(payment.amount) if payment.amount else 0.0,
"payment_method": get_enum_value(payment.payment_method),
"payment_type": get_enum_value(payment.payment_type),
"payment_status": get_enum_value(payment.payment_status),
"transaction_id": payment.transaction_id,
"payment_date": payment.payment_date.isoformat() if payment.payment_date else None,
}
except AttributeError as ae:
print(f"AttributeError accessing payment fields: {ae}")
print(f"Payment object: {payment}")
print(f"Payment payment_method: {payment.payment_method if hasattr(payment, 'payment_method') else 'missing'}")
print(f"Payment payment_type: {payment.payment_type if hasattr(payment, 'payment_type') else 'missing'}")
print(f"Payment payment_status: {payment.payment_status if hasattr(payment, 'payment_status') else 'missing'}")
raise
except ValueError as e:
# Re-raise ValueError as-is (these are expected errors)
db.rollback()
raise
except Exception as e:
import traceback
error_details = traceback.format_exc()
error_msg = str(e) if str(e) else f"{type(e).__name__}: {repr(e)}"
print(f"Error in confirm_payment: {error_msg}")
print(f"Traceback: {error_details}")
db.rollback()
raise ValueError(f"Error confirming payment: {error_msg}")
@staticmethod
def handle_webhook(
payload: bytes,
signature: str,
db: Session
) -> Dict[str, Any]:
"""
Handle Stripe webhook events
Args:
payload: Raw webhook payload
signature: Stripe signature header
db: Database session
Returns:
Webhook event data
"""
webhook_secret = get_stripe_webhook_secret(db)
if not webhook_secret:
webhook_secret = settings.STRIPE_WEBHOOK_SECRET
if not webhook_secret:
raise ValueError("Stripe webhook secret is not configured. Please configure it in Admin Panel (Settings > Stripe Settings) or set STRIPE_WEBHOOK_SECRET environment variable.")
try:
event = stripe.Webhook.construct_event(
payload, signature, webhook_secret
)
except ValueError as e:
raise ValueError(f"Invalid payload: {str(e)}")
except stripe.SignatureVerificationError as e:
raise ValueError(f"Invalid signature: {str(e)}")
# Handle the event
if event["type"] == "payment_intent.succeeded":
payment_intent = event["data"]["object"]
payment_intent_id = payment_intent["id"]
metadata = payment_intent.get("metadata", {})
booking_id = metadata.get("booking_id")
if booking_id:
try:
StripeService.confirm_payment(
payment_intent_id=payment_intent_id,
db=db,
booking_id=int(booking_id)
)
except Exception as e:
print(f"Error processing webhook for booking {booking_id}: {str(e)}")
elif event["type"] == "payment_intent.payment_failed":
payment_intent = event["data"]["object"]
payment_intent_id = payment_intent["id"]
metadata = payment_intent.get("metadata", {})
booking_id = metadata.get("booking_id")
if booking_id:
# Update payment status to failed
payment = db.query(Payment).filter(
Payment.transaction_id == payment_intent_id,
Payment.booking_id == int(booking_id)
).first()
if payment:
payment.payment_status = PaymentStatus.failed
db.commit()
return {
"status": "success",
"event_type": event["type"],
"event_id": event["id"],
}
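One possible FastAPI wiring for this handler (route path, `get_db`, and import locations are assumptions, not part of this file):

```python
from fastapi import APIRouter, Depends, Header, HTTPException, Request
from sqlalchemy.orm import Session

webhook_router = APIRouter()

@webhook_router.post("/payments/stripe/webhook")
async def stripe_webhook(
    request: Request,
    stripe_signature: str = Header(None),  # FastAPI maps this to the Stripe-Signature header
    db: Session = Depends(get_db),         # get_db assumed from the app's dependencies
):
    payload = await request.body()
    try:
        return StripeService.handle_webhook(payload, stripe_signature, db)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
```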

Backend/venv/bin/normalizer Executable file
View File

@@ -0,0 +1,7 @@
#!/home/gnx/Desktop/Hotel-Booking/Backend/venv/bin/python3
import sys
from charset_normalizer.cli import cli_detect
if __name__ == '__main__':
if sys.argv[0].endswith('.exe'):
sys.argv[0] = sys.argv[0][:-4]
sys.exit(cli_detect())

View File

@@ -0,0 +1,78 @@
Metadata-Version: 2.4
Name: certifi
Version: 2025.11.12
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: me@kennethreitz.com
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Requires-Python: >=3.7
License-File: LICENSE
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: home-page
Dynamic: license
Dynamic: license-file
Dynamic: project-url
Dynamic: requires-python
Dynamic: summary
Certifi: Python SSL Certificates
================================
Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.
Installation
------------
``certifi`` is available on PyPI. Simply install it with ``pip``::
$ pip install certifi
Usage
-----
To reference the installed certificate authority (CA) bundle, you can use the
built-in function::
>>> import certifi
>>> certifi.where()
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
Or from the command line::
$ python -m certifi
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
Enjoy!
.. _`Requests`: https://requests.readthedocs.io/en/master/
Addition/Removal of Certificates
--------------------------------
Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.

View File

@@ -0,0 +1,14 @@
certifi-2025.11.12.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2025.11.12.dist-info/METADATA,sha256=_JprGu_1lWSdHlruRBKcorXnrfvBDhvX_6KRr8HQbLc,2475
certifi-2025.11.12.dist-info/RECORD,,
certifi-2025.11.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
certifi-2025.11.12.dist-info/licenses/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
certifi-2025.11.12.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=1BRSxNMnZW7CZ2oJtYWLoJgfHfcB9i273exwiPwfjJM,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-312.pyc,,
certifi/__pycache__/__main__.cpython-312.pyc,,
certifi/__pycache__/core.cpython-312.pyc,,
certifi/cacert.pem,sha256=oa1dZD4hxDtb7XTH4IkdzbWPavUcis4eTwINZUqlKhY,283932
certifi/core.py,sha256=XFXycndG5pf37ayeF8N32HUuDafsyhkVMbO4BAPWHa0,3394
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (80.9.0)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@@ -0,0 +1,20 @@
This package contains a modified version of ca-bundle.crt:
ca-bundle.crt -- Bundle of CA Root Certificates
This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.#
***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.
***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $

View File

@@ -0,0 +1,4 @@
from .core import contents, where
__all__ = ["contents", "where"]
__version__ = "2025.11.12"

View File

@@ -0,0 +1,12 @@
import argparse
from certifi import contents, where
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()
if args.contents:
print(contents())
else:
print(where())

File diff suppressed because it is too large

View File

@@ -0,0 +1,83 @@
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem or its contents.
"""
import sys
import atexit
def exit_cacert_ctx() -> None:
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
if sys.version_info >= (3, 11):
from importlib.resources import as_file, files
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the file
# in cases where we're inside of a zipimport situation until someone
# actually calls where(), but we don't want to re-extract the file
# on every call of where(), so we'll do it once then store it in a
# global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you to
# manage the cleanup of this file, so it doesn't actually return a
# path, it returns a context manager that will give you the path
# when you enter it and will do any cleanup when you leave it. In
# the common case of not needing a temporary file, it will just
# return the file system location and the __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
else:
from importlib.resources import path as get_path, read_text
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the
# file in cases where we're inside of a zipimport situation until
# someone actually calls where(), but we don't want to re-extract
# the file on every call of where(), so we'll do it once then store
# it in a global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you
# to manage the cleanup of this file, so it doesn't actually
# return a path, it returns a context manager that will give
# you the path when you enter it and will do any cleanup when
# you leave it. In the common case of not needing a temporary
# file, it will just return the file system location and the
# __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = get_path("certifi", "cacert.pem")
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")

View File

@@ -0,0 +1,764 @@
Metadata-Version: 2.4
Name: charset-normalizer
Version: 3.4.4
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
License: MIT
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
Project-URL: Code, https://github.com/jawah/charset_normalizer
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Typing :: Typed
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode-backport
Dynamic: license-file
<h1 align="center">Charset Detection, for Everyone 👋</h1>
<p align="center">
<sup>The Real First Universal Charset Detector</sup><br>
<a href="https://pypi.org/project/charset-normalizer">
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
</a>
<a href="https://pepy.tech/project/charset-normalizer/">
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
</a>
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
</a>
</p>
<p align="center">
<sup><i>Featured Packages</i></sup><br>
<a href="https://github.com/jawah/niquests">
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
</a>
<a href="https://github.com/jawah/wassima">
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
</a>
</p>
<p align="center">
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
<a href="https://github.com/nickspring/charset-normalizer-rs">
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
</a>
</p>
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
> I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
<p align="center">
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
</p>
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
| `Fast` | ❌ | ✅ | ✅ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
<p align="center">
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
</p>
*\*\* : They are clearly using specific code for a specific encoding even if it covers most of the encodings in use*<br>
## ⚡ Performance
This package offers better performance than its counterpart Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
_updated as of december 2024 using CPython 3.12_
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays depend heavily on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet's accuracy vs ours is measured using Chardet's initial capability
> (e.g. supported encodings). Challenge them if you want.
## ✨ Installation
Using pip:
```sh
pip install charset-normalizer -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
or
```bash
python -m charset_normalizer ./data/sample.1.fr.srt
```
🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
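Beyond printing the decoded text, the returned best match also exposes the metadata shown in the CLI JSON output above. A minimal sketch (the file path is hypothetical, and `best()` may return `None` when nothing plausible was found):
```python
from charset_normalizer import from_path

results = from_path('./my_subtitle.srt')  # hypothetical path to a text file
best_guess = results.best()  # None when no plausible charset was found

if best_guess is not None:
    print(best_guess.encoding)   # e.g. "cp1252"
    print(best_guess.alphabets)  # e.g. ["Basic Latin", "Latin-1 Supplement"]
```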
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) backward-compatible result possible.
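As an illustrative, hedged sketch of that drop-in behavior: like chardet's `detect`, the legacy function takes bytes and returns a chardet-style dict (the exact confidence value will vary):
```python
from charset_normalizer import detect

# Same call shape as chardet.detect(); the payload below is just an example.
result = detect('Bсеки човек има право на образование.'.encode('utf_8'))
print(result)  # e.g. {'encoding': 'utf-8', 'language': '...', 'confidence': ...}
```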
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
## 😇 Why
When I started using Chardet, I noticed that it did not meet my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down from a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered strings.**
What I want is to get readable text, the best I can.
In a way, **I'm brute-forcing text decoding.** How cool is that? 😎
Don't confuse the package **ftfy** with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer converts a raw file in an unknown encoding to Unicode.
## 🍰 How
- Discard all charset encoding tables that could not fit the binary content.
- Measure the noise, or the mess, once the content is opened (by chunks) with a corresponding charset encoding.
- Extract the matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language.
**Wait a minute**, what are noise/mess and coherence according to **YOU?**
*Noise :* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
I know that my interpretation of noise is probably incomplete; feel free to contribute in order to
improve or rewrite it.
*Coherence :* For each language on earth, we have computed ranked letter-occurrence frequencies (the best we can). I thought
that intel was worth something here, so I use those records against the decoded text to check if I can detect intelligent design.
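To make those two measures concrete, here is a deliberately naive, self-contained sketch of the idea; it is not the package's actual implementation (the real mess and coherence detectors live in `md.py` and `cd.py` and are far more elaborate):
```python
def toy_noise(text: str) -> float:
    """Naive 'mess' score: the share of characters that are replacement
    marks or unprintable, which tends to spike on wrong decodings."""
    if not text:
        return 0.0
    suspicious = sum(
        1 for ch in text
        if ch == "\ufffd" or (not ch.isprintable() and ch not in "\n\r\t")
    )
    return suspicious / len(text)

def toy_coherence(text: str, frequent_letters: str = "etaoinsrhld") -> float:
    """Naive 'coherence' score: the share of alphabetic characters found
    among a language's most frequent letters (an English ranking assumed)."""
    letters = [ch.lower() for ch in text if ch.isalpha()]
    if not letters:
        return 0.0
    return sum(1 for ch in letters if ch in frequent_letters) / len(letters)

# Decode the same bytes with several candidate code pages and compare scores.
payload = "Comme ça, c'est déjà mieux.".encode("utf_8")
for codec in ("utf_8", "cp1252", "ascii"):
    decoded = payload.decode(codec, errors="replace")
    print(codec, round(toy_noise(decoded), 3), round(toy_coherence(decoded), 3))
```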
## ⚡ Known limitations
- Language detection is unreliable when the text contains two or more languages sharing identical letters. (e.g. HTML (English tags) + Turkish content (sharing Latin characters))
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
## ⚠️ About Python EOLs
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
- Python 3.7: charset-normalizer < 4.0
Upgrade your Python interpreter as soon as possible.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.<br />
Feel free to check the [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Character frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
## 💼 For Enterprise
Professional support for charset-normalizer is available as part of the [Tidelift
Subscription][1]. Tidelift gives software development teams a single source for
purchasing and maintaining their software, with professional grade assurances
from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
# Changelog
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [3.4.4](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.4) (2025-10-13)
### Changed
- Bound `setuptools` to a specific constraint `setuptools>=68,<=81`.
- Raised upper bound of mypyc for the optional pre-built extension to v1.18.2
### Removed
- `setuptools-scm` as a build dependency.
### Misc
- Enforced hashes in `dev-requirements.txt` and created `ci-requirements.txt` for security purposes.
- Additional pre-built wheels for riscv64, s390x, and armv7l architectures.
- Restore `multiple.intoto.jsonl` in GitHub releases in addition to the individual attestation file per wheel.
## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
### Changed
- mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
- Automatically lower the confidence on small byte samples that are not Unicode in the legacy `detect` output function. (#391)
### Added
- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
- Support for Python 3.14
### Fixed
- sdist archive contained useless directories.
- Automatically fall back on valid UTF-16 or UTF-32 even if the mess detector (md) says it's noisy. (#633)
### Misc
- SBOMs are automatically published to the relevant GitHub release to comply with regulatory changes.
Each published wheel comes with its SBOM. We chose CycloneDX as the format.
- Prebuilt optimized wheels are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
### Fixed
- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
### Changed
- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
### Changed
- Project metadata is now stored using `pyproject.toml` instead of `setup.cfg`, with setuptools as the build backend.
- Enforce delayed annotation loading for simpler and more consistent types in the project.
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
### Added
- pre-commit configuration.
- noxfile.
### Removed
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
- Unused `utils.range_scan` function.
### Fixed
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
### Added
- Argument `--no-preemptive` in the CLI to prevent the detector from searching for hints.
- Support for Python 3.13 (#512)
### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything other than a CharsetMatch.
- Improved the general reliability of the detector based on user feedback. (#520) (#509) (#498) (#407) (#537)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
- Unintentional memory usage regression when using large payloads that match several encodings (#376)
- Regression on some detection case showcased in the documentation (#371)
### Added
- Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
### Changed
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
- Improved the general detection reliability based on reports from the community
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
### Added
- Allow executing the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
- Support for 9 forgotten encodings that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
### Removed
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
### Changed
- Typehint for function `from_path` no longer enforces `PathLike` as its first argument
- Minor improvement over the global detection reliability
### Added
- Introduce function `is_binary` that relies on the main capabilities, optimized to detect binaries
- Propagate the `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp`, allowing deeper control over the detection (default True)
- Explicit support for Python 3.12
### Fixed
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
### Added
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
### Removed
- Support for Python 3.6 (PR #260)
### Changed
- Optional speedup provided by mypy/c 1.0.1
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
### Fixed
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
### Changed
- Speedup provided by mypy/c 0.990 on Python >= 3.7
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one); will log the details of the mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
- `normalizer --version` now specifies whether the current version provides the extra speedup (meaning a mypyc-compiled wheel)
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Fixed
- CLI with opt --normalize fails when using a full path for files
- TooManyAccentuatedPlugin induces false positives on the mess detection when too few alpha characters have been fed to it
- Sphinx warnings when generating the documentation
### Removed
- Coherence detector no longer returns 'Simple English'; it returns 'English' instead
- Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
- Breaking: Methods `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable/conflicts with ASCII)
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one); will log the details of the mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
### Fixed
- CLI with opt --normalize fails when using a full path for files
- TooManyAccentuatedPlugin induces false positives on the mess detection when too few alpha characters have been fed to it
### Removed
- Coherence detector no longer returns 'Simple English'; it returns 'English' instead
- Coherence detector no longer returns 'Classical Chinese'; it returns 'Chinese' instead
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
### Added
- `normalizer --version` now specifies whether the current version provides the extra speedup (meaning a mypyc-compiled wheel)
### Removed
- Breaking: Methods `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (it is unreliable/conflicts with ASCII)
### Fixed
- Sphinx warnings when generating the documentation
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
### Changed
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Removed
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
- Function `normalize` scheduled for removal in 3.0
### Changed
- Removed useless call to decode in fn is_unprintable (#206)
### Fixed
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
### Added
- Output the Unicode table version when running the CLI with `--version` (PR #194)
### Changed
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
### Fixed
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
### Removed
- Support for Python 3.5 (PR #192)
### Deprecated
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
- ASCII mis-detection in rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
### Added
- Explicit support for Python 3.11 (PR #164)
### Changed
- The logging behavior has been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
### Fixed
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
### Changed
- Skipping the language-detection (CD) on ASCII (PR #155)
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
### Changed
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
### Fixed
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
### Changed
- Improvement over Vietnamese detection (PR #126)
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
- Avoid using too insignificant chunk (PR #137)
### Added
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
### Changed
- Further improve inferring the language from a given single-byte code page (PR #112)
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
- Various detection improvement (MD+CD) (PR #117)
### Removed
- Remove redundant logging entry about detected language(s) (PR #115)
### Fixed
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
### Fixed
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
- Fix CLI crash when using --minimal output in certain cases (PR #103)
### Changed
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
### Changed
- The project now complies with: flake8, mypy, isort and black to ensure better overall quality (PR #81)
- The BC support with v1.x was improved; the old staticmethods are restored (PR #82)
- The Unicode detection is slightly improved (PR #93)
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
### Removed
- The project no longer raises a warning on tiny content given for detection; it is simply logged as a warning instead (PR #92)
### Fixed
- In some rare cases, the chunk extractor could cut in the middle of a multi-byte character and mislead the mess detection (PR #95)
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
- The MANIFEST.in was not exhaustive (PR #78)
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
### Fixed
- The CLI no longer raises an unexpected exception when no encoding has been found (PR #70)
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
- Submatch factoring could be wrong in rare edge cases (PR #72)
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
- Fix line endings from CRLF to LF for certain project files (PR #67)
### Changed
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
- Allow fallback on specified encoding if any (PR #71)
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
### Changed
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
- Empty/too-small JSON payload mis-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
### Fixed
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
- Using explain=False permanently disabled the verbose output in the current runtime (PR #47)
- One log entry (language target preemptive) was not shown in logs when using explain=True (PR #47)
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
### Changed
- The default argument values of the public function normalize were not aligned with from_bytes (PR #53)
### Added
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
### Changed
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
- Emphasis has been put on UTF-8 detection; it should perform almost instantaneously.
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
- The detection mechanism has been slightly improved; Turkish content is now detected correctly (most of the time).
- The program has been rewritten to ease readability and maintainability (+ using static typing).
- utf_7 detection has been reinstated.
### Removed
- This package no longer requires anything when used with Python 3.5 (dropped cached_property)
- Removed support for these languages: Catalan, Esperanto, Kazakh, Basque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
- The exception hook on UnicodeDecodeError has been removed.
### Deprecated
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
### Fixed
- The CLI output used the relative path of the file(s). It should be absolute.
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
### Fixed
- Logger configuration/usage no longer conflict with others (PR #44)
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
### Removed
- Using standard logging instead of using the package loguru.
- Dropping nose test framework in favor of the maintained pytest.
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
- Stop support for UTF-7 that does not contain a SIG.
- Dropping PrettyTable, replaced with pure JSON output in CLI.
### Fixed
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
- Not searching properly for the BOM when trying utf32/16 parent codec.
### Changed
- Improving the package final size by compressing frequencies.json.
- Huge improvement on the largest payloads.
### Added
- CLI now produces JSON consumable output.
- Return ASCII if given sequences fit. Given reasonable confidence.
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
### Fixed
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
### Fixed
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
### Fixed
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
### Changed
- Amend the previous release to allow prettytable 2.0 (PR #35)
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
### Fixed
- Fix error while using the package with a python pre-release interpreter (PR #33)
### Changed
- Dependencies refactoring, constraints revised.
### Added
- Add Python 3.9 and 3.10 to the supported interpreters
MIT License
Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,35 @@
../../../bin/normalizer,sha256=Jbv4iS8Kds1vnENEtxQXFDyfqxed7gWl0INvAYs2yoI,246
charset_normalizer-3.4.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
charset_normalizer-3.4.4.dist-info/METADATA,sha256=jVuUFBti8dav19YLvWissTihVdF2ozUY4KKMw7jdkBQ,37303
charset_normalizer-3.4.4.dist-info/RECORD,,
charset_normalizer-3.4.4.dist-info/WHEEL,sha256=DxRnWQz-Kp9-4a4hdDHsSv0KUC3H7sN9Nbef3-8RjXU,190
charset_normalizer-3.4.4.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
charset_normalizer-3.4.4.dist-info/licenses/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
charset_normalizer-3.4.4.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
charset_normalizer/__pycache__/api.cpython-312.pyc,,
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
charset_normalizer/__pycache__/md.cpython-312.pyc,,
charset_normalizer/__pycache__/models.cpython-312.pyc,,
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
charset_normalizer/__pycache__/version.cpython-312.pyc,,
charset_normalizer/api.py,sha256=V07i8aVeCD8T2fSia3C-fn0i9t8qQguEBhsqszg32Ns,22668
charset_normalizer/cd.py,sha256=WKTo1HDb-H9HfCDc3Bfwq5jzS25Ziy9SE2a74SgTq88,12522
charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
charset_normalizer/cli/__main__.py,sha256=dMaXG6IJXRvqq8z2tig7Qb83-BpWTln55ooiku5_uvg,12646
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
charset_normalizer/constant.py,sha256=7UVY4ldYhmQMHUdgQ_sgZmzcQ0xxYxpBunqSZ-XJZ8U,42713
charset_normalizer/legacy.py,sha256=sYBzSpzsRrg_wF4LP536pG64BItw7Tqtc3SMQAHvFLM,2731
charset_normalizer/md.cpython-312-x86_64-linux-gnu.so,sha256=sZ7umtJLjKfA83NFJ7npkiDyr06zDT8cWtl6uIx2MsM,15912
charset_normalizer/md.py,sha256=-_oN3h3_X99nkFfqamD3yu45DC_wfk5odH0Tr_CQiXs,20145
charset_normalizer/md__mypyc.cpython-312-x86_64-linux-gnu.so,sha256=J2WWgLBQiO8sqdFsENp9u5V9uEH0tTwvTLszPdqhsv0,290584
charset_normalizer/models.py,sha256=lKXhOnIPtiakbK3i__J9wpOfzx3JDTKj7Dn3Rg0VaRI,12394
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/utils.py,sha256=sTejPgrdlNsKNucZfJCxJ95lMTLA0ShHLLE3n5wpT9Q,12170
charset_normalizer/version.py,sha256=nKE4qBNk5WA4LIJ_yIH_aSDfvtsyizkWMg-PUG-UZVk,115

View File

@@ -0,0 +1,7 @@
Wheel-Version: 1.0
Generator: setuptools (80.9.0)
Root-Is-Purelib: false
Tag: cp312-cp312-manylinux_2_17_x86_64
Tag: cp312-cp312-manylinux2014_x86_64
Tag: cp312-cp312-manylinux_2_28_x86_64

View File

@@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer.cli:cli_detect

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1 @@
charset_normalizer

View File

@@ -0,0 +1,48 @@
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, this package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Other methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
from __future__ import annotations
import logging
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())

View File

@@ -0,0 +1,6 @@
from __future__ import annotations
from .cli import cli_detect
if __name__ == "__main__":
cli_detect()

View File

@@ -0,0 +1,669 @@
from __future__ import annotations
import logging
from os import PathLike
from typing import BinaryIO
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possible charsets usable to render str objects.
If there are no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
and will give up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behaviour DOES NOT replace the traditional detection workflow; it prioritizes a particular code page
but never takes it for granted. It can improve performance.
You may want to focus your attention on some code pages and/or exclude others; use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not set up any handler other than the NullHandler; if you set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
A custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: list[str] = []
specified_encoding: str | None = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: set[str] = set()
tested_but_hard_failure: list[str] = []
tested_but_soft_failure: list[str] = []
fallback_ascii: CharsetMatch | None = None
fallback_u8: CharsetMatch | None = None
fallback_specified: CharsetMatch | None = None
results: CharsetMatches = CharsetMatches()
early_stop_results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: str | None = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: list[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold,
explain is True and 1 <= len(cp_isolation) <= 2,
)
)
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except (
UnicodeDecodeError
) as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
enable_fallback
and encoding_iana
in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences,
encoding_iana,
threshold,
bom_or_sig_available,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: list[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk,
language_threshold,
",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
current_match = CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
(
decoded_payload
if (
is_too_large_sequence is False
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
)
else None
),
preemptive_declaration=specified_encoding,
)
results.append(current_match)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
# If md says nothing to worry about, then... stop immediately!
if mean_mess_ratio == 0.0:
logger.debug(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])
early_stop_results.append(current_match)
if (
len(early_stop_results)
and (specified_encoding is None or specified_encoding in tested)
and "ascii" in tested
and "utf_8" in tested
):
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
logger.debug(
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([probable_result])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing as the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def from_path(
path: str | bytes | PathLike, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def is_binary(
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if the given input (file, bytes, or path) points to a binary file, i.e. not text.
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
are disabled, to be stricter with content that is ASCII-compatible but unlikely to be text.
"""
if isinstance(fp_or_path_or_payload, (str, PathLike)):
guesses = from_path(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
elif isinstance(
fp_or_path_or_payload,
(
bytes,
bytearray,
),
):
guesses = from_bytes(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
else:
guesses = from_fp(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
return not guesses

View File

@@ -0,0 +1,395 @@
from __future__ import annotations
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter
from .constant import (
FREQUENCIES,
KO_NAMES,
LANGUAGE_SUPPORTED_COUNT,
TOO_SMALL_SEQUENCE,
ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> list[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise OSError("Function not supported on multi-byte code page")
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: str | None = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> list[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: list[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
"""
Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
primary_range: str | None = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
"""
Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function does the correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
"""
Determine the main aspects of a supported language: whether it contains accents and whether it is pure Latin.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
"""
Return the languages associated with the given characters.
"""
languages: list[tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(
language: str, ordered_characters: list[str]
) -> float:
"""
Determine if an ordered character list (by occurrence, from most frequent to rarest) matches a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near-perfect fit).
Beware that this function is not strict on the match, in order to ease the detection. (Meaning a close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError(f"{language} not available")
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
ordered_characters_count: int = len(ordered_characters)
target_language_characters_count: int = len(FREQUENCIES[language])
large_alphabet: bool = target_language_characters_count > 26
for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
if character not in FREQUENCIES_language_set:
continue
character_rank_in_language: int = FREQUENCIES[language].index(character)
expected_projection_ratio: float = (
target_language_characters_count / ordered_characters_count
)
character_rank_projection: int = int(character_rank * expected_projection_ratio)
if (
large_alphabet is False
and abs(character_rank_projection - character_rank_in_language) > 4
):
continue
if (
large_alphabet is True
and abs(character_rank_projection - character_rank_in_language)
< target_language_characters_count / 3
):
character_approved_count += 1
continue
characters_before_source: list[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: list[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: list[str] = ordered_characters[0:character_rank]
characters_after: list[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
one containing the Latin letters and the other the Hebrew ones.
"""
layers: dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: str | None = unicode_range(character)
if character_range is None:
continue
layer_target_range: str | None = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
"""
This function merges results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: dict[str, list[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
"""
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and removes the em-dash from it.
"""
index_results: dict[str, list[float]] = dict()
for result in results:
language, ratio = result
no_em_name: str = language.replace("—", "")
if no_em_name not in index_results:
index_results[no_em_name] = []
index_results[no_em_name].append(ratio)
if any(len(index_results[e]) > 1 for e in index_results):
filtered_results: CoherenceMatches = []
for language in index_results:
filtered_results.append((language, max(index_results[language])))
return filtered_results
return results
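# Illustrative example (hypothetical ratios): the em-dash alternative is
# folded into its base language, keeping the best ratio only:
#
#     >>> filter_alt_coherence_matches([("English", 0.86), ("English—", 0.84)])
#     [('English', 0.86)]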
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in the given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: list[tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: list[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
)
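# Hedged usage sketch: exact ratios depend on the bundled frequency tables,
# but a plain English sentence is expected to rank "English" first:
#
#     >>> coherence_ratio("The quick brown fox jumps over the lazy dog and keeps running")
#     [('English', 0.9381), ...]  # illustrative values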


@@ -0,0 +1,8 @@
from __future__ import annotations
from .__main__ import cli_detect, query_yes_no
__all__ = (
"cli_detect",
"query_yes_no",
)


@@ -0,0 +1,381 @@
from __future__ import annotations
import argparse
import sys
import typing
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
class FileType:
"""Factory for creating file object types
Instances of FileType are typically passed as type= arguments to the
ArgumentParser add_argument() method.
Keyword Arguments:
- mode -- A string indicating how the file is to be opened. Accepts the
same values as the builtin open() function.
- bufsize -- The file's desired buffer size. Accepts the same values as
the builtin open() function.
- encoding -- The file's encoding. Accepts the same values as the
builtin open() function.
- errors -- A string indicating how encoding and decoding errors are to
be handled. Accepts the same value as the builtin open() function.
Backported from CPython 3.12
"""
def __init__(
self,
mode: str = "r",
bufsize: int = -1,
encoding: str | None = None,
errors: str | None = None,
):
self._mode = mode
self._bufsize = bufsize
self._encoding = encoding
self._errors = errors
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
# the special argument "-" means sys.std{in,out}
if string == "-":
if "r" in self._mode:
return sys.stdin.buffer if "b" in self._mode else sys.stdin
elif any(c in self._mode for c in "wax"):
return sys.stdout.buffer if "b" in self._mode else sys.stdout
else:
msg = f'argument "-" with mode {self._mode}'
raise ValueError(msg)
# all other arguments are used as file names
try:
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
except OSError as e:
message = f"can't open '{string}': {e}"
raise argparse.ArgumentTypeError(message)
def __repr__(self) -> str:
args = self._mode, self._bufsize
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
args_str = ", ".join(
[repr(arg) for arg in args if arg != -1]
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
)
return f"{type(self).__name__}({args_str})"
def cli_detect(argv: list[str] | None = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else signals trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: list[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure you want to normalize "{}" by replacing it?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except OSError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()
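# Hedged usage sketch; the file path below is hypothetical. The same entry
# point is typically exposed as a console command:
#
#     $ python -m charset_normalizer ./legacy_latin1.txt --minimal
#     cp1252
#
# Programmatic invocation returns the exit code directly:
#
#     >>> cli_detect(["./legacy_latin1.txt", "--minimal"])
#     cp1252
#     0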

File diff suppressed because it is too large.


@@ -0,0 +1,80 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict
class ResultDict(TypedDict):
encoding: str | None
language: str
confidence: float | None
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
Encoding names will match Chardet's own naming whenever possible (except for encoding names it does not support).
This function is deprecated and should only be used to ease migrating your project; consult the documentation for
further information. Not planned for removal.
:param byte_str: The byte sequence to examine.
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
"""
if len(kwargs):
warn(
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
)
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# automatically lower the confidence
# on small byte samples.
# https://github.com/jawah/charset_normalizer/issues/391
if (
confidence is not None
and confidence >= 0.9
and encoding
not in {
"utf_8",
"ascii",
}
and r.bom is False # type: ignore[union-attr]
and len(byte_str) < TOO_SMALL_SEQUENCE
):
confidence -= 0.2
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization process
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
encoding = CHARDET_CORRESPONDENCE[encoding]
return {
"encoding": encoding,
"language": language,
"confidence": confidence,
}
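# Hedged example: the concrete values depend on the detector, but the shape
# of the returned dict is fixed:
#
#     >>> detect("Bonjour, le café est chaud".encode("cp1252"))
#     {'encoding': 'windows-1252', 'language': 'French', 'confidence': 0.94}  # illustrative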


@@ -0,0 +1,635 @@
from __future__ import annotations
from functools import lru_cache
from logging import getLogger
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
TRACE,
UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
is_cjk_uncommon,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if the given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed upon character.
Insert the logic in which the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Reset the plugin to its initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.0; there is no upper restriction.
"""
raise NotImplementedError # pragma: nocover
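# Minimal sketch of a custom plugin (hypothetical, not part of the library).
# Any subclass defined at import time is auto-discovered by mess_ratio()
# below through MessDetectorPlugin.__subclasses__(), which is why this sketch
# is kept commented out: defining it would enlarge the active detector set.
#
#     class TooManyDigitsPlugin(MessDetectorPlugin):
#         def __init__(self) -> None:
#             self._digit_count: int = 0
#             self._character_count: int = 0
#
#         def eligible(self, character: str) -> bool:
#             return character.isprintable()
#
#         def feed(self, character: str) -> None:
#             self._character_count += 1
#             if character.isdigit():
#                 self._digit_count += 1
#
#         def reset(self) -> None:
#             self._digit_count = 0
#             self._character_count = 0
#
#         @property
#         def ratio(self) -> float:
#             if self._character_count == 0:
#                 return 0.0
#             return self._digit_count / self._character_count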
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: str | None = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # Abstract
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # Abstract
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # Abstract
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: str | None = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # Abstract
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: str | None = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
unicode_range_b: str | None = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # Abstract
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count <= 13:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# A word/buffer ending with an uppercase accentuated letter is so rare
# that we consider them all suspicious. Same weight as foreign_long suspicious.
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True
if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # Abstract
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkUncommonPlugin(MessDetectorPlugin):
"""
Detect messy CJK text that probably means nothing.
"""
def __init__(self) -> None:
self._character_count: int = 0
self._uncommon_count: int = 0
def eligible(self, character: str) -> bool:
return is_cjk(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_cjk_uncommon(character):
self._uncommon_count += 1
return
def reset(self) -> None: # Abstract
self._character_count = 0
self._uncommon_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
uncommon_form_usage: float = self._uncommon_count / self._character_count
# We can be pretty sure it's garbage when uncommon characters are widely
# used. Otherwise it could just be Traditional Chinese, for example.
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: str | None = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and character.isascii() is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # Abstract
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # Abstract
self._character_count = 0
self._isolated_form_count = 0
def eligible(self, character: str) -> bool:
return is_arabic(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_arabic_isolated_form(character):
self._isolated_form_count += 1
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
isolated_form_usage: float = self._isolated_form_count / self._character_count
return isolated_form_usage
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
"""
Determine if two Unicode ranges seen next to each other can be considered suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
# Latin characters can be accompanied by a combining diacritical mark,
# e.g. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = (
unicode_range_a.split(" "),
unicode_range_b.split(" "),
)
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
return True
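# Behaviour sketch based on the rules above (range names are the official
# Unicode block names returned by unicode_range()):
#
#     >>> is_suspiciously_successive_range("Basic Latin", "Hebrew")
#     True
#     >>> is_suspiciously_successive_range("Hiragana", "CJK Unified Ideographs")
#     False  # Japanese exception
#     >>> is_suspiciously_successive_range("Latin Extended-A", "Latin Extended-B")
#     False  # both Latin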
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
"""
detectors: list[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
logger = getLogger("charset_normalizer")
logger.log(
TRACE,
"Mess-detector extended-analysis start. "
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
f"maximum_threshold={maximum_threshold}",
)
if len(decoded_sequence) > 16:
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors:
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
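# Hedged usage sketch: exact figures vary with the plugin set, but clean text
# stays at or near 0.0 while mojibake scores noticeably higher:
#
#     >>> mess_ratio("This is a perfectly ordinary sentence.")
#     0.0
#     >>> mess_ratio("ÃƒÂ©lÃƒÂ©gante Ã‚Â»")  # UTF-8 decoded twice, typical mojibake
#     0.5  # illustrative value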


@@ -0,0 +1,360 @@
from __future__ import annotations
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: CoherenceMatches,
decoded_payload: str | None = None,
preemptive_declaration: str | None = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: list[str] | None = None
self._leaves: list[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: bytes | None = None
self._output_encoding: str | None = None
self._string: str | None = decoded_payload
self._preemptive_declaration: str | None = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
return iana_name(other) == self.encoding
return False
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted available upon CharsetMatches items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
return self.coherence > other.coherence
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
# When facing a difficult decision, use the result that decoded as many multi-byte characters as possible.
# preserve RAM usage!
if len(self._payload) >= TOO_BIG_SEQUENCE:
return self.chaos < other.chaos
return self.multi_byte_usage > other.multi_byte_usage
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - (len(str(self)) / len(self.raw))
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
def add_submatch(self, other: CharsetMatch) -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> list[str]:
"""
Encodings are known by many names; using this could help when searching for IBM855 when it is listed as CP855.
"""
also_known_as: list[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> list[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
# It's either English or we should not pronounce ourselves in certain cases.
if "ascii" in self.could_be_from_charset:
return "English"
# done here to avoid a circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> list[CharsetMatch]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> list[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> list[str]:
"""
The complete list of encodings that output the exact SAME str result and therefore could be the originating
encoding.
This list does include the encoding available in the property 'encoding'.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
Unsupported characters are replaced by the encoder rather than raising an error.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0],
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
),
decoded_string[:8192],
count=1,
)
decoded_string = patched_header + decoded_string[8192:]
self._output_payload = decoded_string.encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
Container with every CharsetMatch item ordered by default from the most probable to the least.
Acts like a list (iterable) but does not implement all related methods.
"""
def __init__(self, results: list[CharsetMatch] | None = None):
self._results: list[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. Will be inserted accordingly to preserve sort.
Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) < TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> CharsetMatch | None:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> CharsetMatch | None:
"""
Redundant method that calls best(). Kept for backward-compatibility reasons.
"""
return self.best()
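# Hedged usage sketch, assuming from_bytes() from the api module of this
# package:
#
#     >>> from charset_normalizer import from_bytes
#     >>> matches = from_bytes("Übergrößenträger".encode("cp1252"))
#     >>> matches.best().encoding
#     'cp1252'  # illustrative; submatches may expose equivalent codecs
#     >>> matches["cp1252"] is matches.best()
#     True  # lookup by name/alias, KeyError if absent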
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: str | None,
encoding_aliases: list[str],
alternative_encodings: list[str],
language: str,
alphabets: list[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: str | None,
is_preferred: bool,
):
self.path: str = path
self.unicode_path: str | None = unicode_path
self.encoding: str | None = encoding
self.encoding_aliases: list[str] = encoding_aliases
self.alternative_encodings: list[str] = alternative_encodings
self.language: str = language
self.alphabets: list[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)


@@ -0,0 +1,414 @@
from __future__ import annotations
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator
from _multibytecodec import ( # type: ignore[import-not-found,import]
MultibyteIncrementalDecoder,
)
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
COMMON_CJK_CHARACTERS,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: list[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
"""
Retrieve the Unicode range official name from a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range and character_category != "Lo"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range or "Pictographs" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"｜", "+", "<", ">"}:  # "｜" = fullwidth vertical line (U+FF5C)
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk_uncommon(character: str) -> bool:
return character not in COMMON_CJK_CHARACTERS
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1a" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
"""
Extract any specified encoding from the first n bytes, using an ASCII-only decoder.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: list[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
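# Illustrative example, assuming RE_POSSIBLE_ENCODING_INDICATION (defined in
# constant.py) matches declarations such as the XML prolog below:
#
#     >>> any_specified_encoding(b'<?xml version="1.0" encoding="ISO-8859-1"?>')
#     'latin_1'  # normalized through encodings.aliases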
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
Verify whether a specific encoding is a multi-byte one based on its IANA name.
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
"""Returns the Python normalized encoding name (Not the IANA official name)."""
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
return cp_name
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
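# Hedged example: two single-byte Latin code pages decode most of the 0x00-0xFE
# range identically, so their similarity is high:
#
#     >>> round(cp_similarity("cp1252", "latin_1"), 2)
#     0.9  # illustrative; the exact value depends on the codec tables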
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: str | None = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk


@@ -0,0 +1,8 @@
"""
Expose version
"""
from __future__ import annotations
__version__ = "3.4.4"
VERSION = __version__.split(".")

Some files were not shown because too many files have changed in this diff.