# Copyright (c) the purl authors
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Visit https://github.com/package-url/packageurl-python for support and
# download.
"""
A purl (aka. Package URL) implementation as specified at:
https://github.com/package-url/purl-spec
"""

from __future__ import annotations

import dataclasses
import re
import string
from collections import namedtuple
from collections.abc import Mapping
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING
from typing import Any
from typing import Optional
from typing import Union
from typing import overload
from urllib.parse import quote as _percent_quote
from urllib.parse import unquote as _percent_unquote
from urllib.parse import urlsplit as _urlsplit

# NOTE(review): not referenced in the visible part of this module; presumably
# kept for re-export -- confirm before removing.
from packageurl.contrib.route import NoRouteAvailable

if TYPE_CHECKING:
    from collections.abc import Callable
    from collections.abc import Iterable
    from typing import ClassVar

    from typing_extensions import Literal
    from typing_extensions import Self

# Alias used in annotations throughout this module: a text or a byte string.
AnyStr = Union[str, bytes]

# Python 3
basestring = (bytes, str)


class ValidationSeverity(str, Enum):
    """Severity level attached to a ValidationMessage."""

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"


@dataclass
class ValidationMessage:
    """A single validation finding: a severity and a human-readable message."""

    severity: ValidationSeverity
    message: str
    # Un-annotated on purpose: this is a method (not a dataclass field) that
    # returns the message as a plain dict.
    to_dict = dataclasses.asdict


def quote(s: AnyStr) -> str:
    """
    Return a percent-encoded unicode string, except for colon :, given an `s`
    byte or unicode string.
    """
    s_bytes = s.encode("utf-8") if isinstance(s, str) else s
    # urllib's quote() always returns a str; the only post-processing needed is
    # to restore the colon that it escaped.
    return _percent_quote(s_bytes).replace("%3A", ":")


def unquote(s: AnyStr) -> str:
    """
    Return a percent-decoded unicode string, given an `s` byte or unicode
    string.
    """
    # urllib's unquote() always returns a str, so no decoding step is needed.
    return _percent_unquote(s)


@overload
def get_quoter(encode: bool = True) -> Callable[[AnyStr], str]: ...


@overload
def get_quoter(encode: None) -> Callable[[str], str]: ...
def get_quoter(encode: bool | None = True) -> Callable[[AnyStr], str] | Callable[[str], str]:
    """
    Return quoting callable given an `encode` tri-boolean (True, False or None)

    True selects `quote`, False selects `unquote` and None selects an identity
    function that returns its argument unchanged.
    """
    if encode is True:
        return quote
    elif encode is False:
        return unquote
    elif encode is None:
        return lambda x: x


def normalize_type(type: AnyStr | None, encode: bool | None = True) -> str | None:
    """
    Return the purl `type` quoted per `encode`, stripped and lowercased, or
    None when it is empty.
    """
    if not type:
        return None

    type_str = type if isinstance(type, str) else type.decode("utf-8")
    quoter = get_quoter(encode)
    type_str = quoter(type_str)
    return type_str.strip().lower() or None


def normalize_namespace(
    namespace: AnyStr | None, ptype: str | None, encode: bool | None = True
) -> str | None:
    """
    Return the normalized `namespace` for the purl type `ptype`, or None when
    it is empty.

    Surrounding whitespace and slashes are removed, type-specific case folding
    is applied, empty segments are dropped and each remaining segment is quoted
    per `encode`.
    """
    if not namespace:
        return None

    namespace_str = namespace if isinstance(namespace, str) else namespace.decode("utf-8")
    namespace_str = namespace_str.strip().strip("/")
    if ptype in (
        "bitbucket",
        "github",
        "pypi",
        "gitlab",
        "composer",
        "luarocks",
        "qpkg",
        "alpm",
        "apk",
        "hex",
    ):
        namespace_str = namespace_str.lower()
    # Exact comparison: `ptype in ("cpan")` is a substring test against the
    # string "cpan" and would also match types such as "c" or "pan".
    if ptype == "cpan":
        namespace_str = namespace_str.upper()
    segments = [seg for seg in namespace_str.split("/") if seg.strip()]
    segments_quoted = map(get_quoter(encode), segments)
    return "/".join(segments_quoted) or None


def normalize_mlflow_name(
    name_str: str,
    qualifiers: Union[str, bytes, dict[str, str], None],
) -> Optional[str]:
    """
    Return the MLflow model name `name_str` with the casing required by the
    repository found in `qualifiers`.

    For Azure ML the name is case sensitive and is kept as-is; for Databricks
    it is case insensitive and is lowercased. The repository is detected from
    the "repository_url" entry when `qualifiers` is a mapping, or from the
    whole qualifiers string otherwise. The name is returned unchanged when
    neither repository is detected.
    """
    if isinstance(qualifiers, bytes):
        # Inspect byte strings the same way as text strings.
        qualifiers = qualifiers.decode("utf-8")

    if isinstance(qualifiers, dict):
        repo_url = qualifiers.get("repository_url")
        if repo_url and "azureml" in repo_url.lower():
            return name_str
        if repo_url and "databricks" in repo_url.lower():
            return name_str.lower()
    if isinstance(qualifiers, str):
        if "azureml" in qualifiers.lower():
            return name_str
        if "databricks" in qualifiers.lower():
            return name_str.lower()
    return name_str


def normalize_name(
    name: AnyStr | None,
    qualifiers: Union[Union[str, bytes], dict[str, str], None],
    ptype: str | None,
    encode: bool | None = True,
) -> Optional[str]:
    """
    Return the normalized `name` for the purl type `ptype`, or None when it is
    empty.

    The name is quoted per `encode`, stripped of surrounding whitespace and
    slashes, then type-specific rules are applied. `qualifiers` is only
    consulted for the "mlflow" type.
    """
    if not name:
        return None

    name_str = name if isinstance(name, str) else name.decode("utf-8")
    quoter = get_quoter(encode)
    name_str = quoter(name_str)
    name_str = name_str.strip().strip("/")
    # Exact comparison: `ptype in ("mlflow")` is a substring test against the
    # string "mlflow" and would also match types such as "flow" or "ml".
    if ptype == "mlflow":
        return normalize_mlflow_name(name_str, qualifiers)
    if ptype in (
        "bitbucket",
        "github",
        "pypi",
        "gitlab",
        "composer",
        "luarocks",
        "oci",
        "npm",
        "alpm",
        "apk",
        "bitnami",
        "hex",
        "pub",
    ):
        name_str = name_str.lower()
    if ptype == "pypi":
        name_str = name_str.replace("_", "-").lower()
    if ptype == "hackage":
        name_str = name_str.replace("_", "-")
    if ptype == "pub":
        name_str = re.sub(r"[^a-z0-9]", "_", name_str.lower())
    return name_str or None


def normalize_version(
    version: AnyStr | None, ptype: Optional[Union[str, bytes]], encode: bool | None = True
) -> str | None:
    """
    Return the `version` stripped and quoted per `encode`, or None when it is
    empty. The version is lowercased for the "huggingface" and "oci" types.
    """
    if not version:
        return None

    version_str = version if isinstance(version, str) else version.decode("utf-8")
    quoter = get_quoter(encode)
    version_str = quoter(version_str.strip())
    if ptype and isinstance(ptype, str) and ptype in ("huggingface", "oci"):
        version_str = version_str.lower()
    # A whitespace-only version collapses to None for every type.
    return version_str or None


@overload
def normalize_qualifiers(
    qualifiers: AnyStr | dict[str, str] | None, encode: Literal[True] = ...
) -> str | None: ...


@overload
def normalize_qualifiers(
    qualifiers: AnyStr | dict[str, str] | None, encode: Literal[False] | None
) -> dict[str, str]: ...


@overload
def normalize_qualifiers(
    qualifiers: AnyStr | dict[str, str] | None, encode: bool | None = ...
) -> str | dict[str, str] | None: ...


def normalize_qualifiers(
    qualifiers: AnyStr | dict[str, str] | None, encode: bool | None = True
) -> str | dict[str, str] | None:
    """
    Return normalized `qualifiers` as a mapping (or as a string if `encode` is
    True). The `qualifiers` arg is either a mapping or a string of
    "key=value" pairs joined with "&".

    Keys are stripped and lowercased, values are quoted per `encode`, pairs
    with an empty key or value are dropped and the result is sorted by key.
    When `encode` is False or None a mapping is always returned (never None);
    when `encode` is True the result is a string, or None if nothing is left.
    Raise ValueError on errors.
    """
    if not qualifiers:
        return None if encode else {}

    if isinstance(qualifiers, (str, bytes)):
        qualifiers_str = qualifiers if isinstance(qualifiers, str) else qualifiers.decode("utf-8")

        # decode string to list of tuples
        qualifiers_list = qualifiers_str.split("&")
        if any("=" not in kv for kv in qualifiers_list):
            raise ValueError(
                f"Invalid qualifier. Must be a string of key=value pairs:{qualifiers_list!r}"
            )
        qualifiers_parts = [kv.partition("=") for kv in qualifiers_list]
        qualifiers_pairs: Iterable[tuple[str, str]] = [(k, v) for k, _, v in qualifiers_parts]
    elif isinstance(qualifiers, dict):
        qualifiers_pairs = qualifiers.items()
    else:
        raise ValueError(f"Invalid qualifier. Must be a string or dict:{qualifiers!r}")

    quoter = get_quoter(encode)
    qualifiers_map = {
        k.strip().lower(): quoter(v)
        for k, v in qualifiers_pairs
        if k and k.strip() and v and v.strip()
    }

    valid_chars = string.ascii_letters + string.digits + ".-_"
    for key in qualifiers_map:
        if not key:
            raise ValueError("A qualifier key cannot be empty")

        # The two checks below are also covered by the character whitelist;
        # they run first so that these common mistakes get a specific message.
        if "%" in key:
            raise ValueError(f"A qualifier key cannot be percent encoded: {key!r}")

        if " " in key:
            raise ValueError(f"A qualifier key cannot contain spaces: {key!r}")

        if any(c not in valid_chars for c in key):
            raise ValueError(
                "A qualifier key must be composed only of ASCII letters and numbers, "
                f"period, dash and underscore: {key!r}"
            )

        if key[0] in string.digits:
            raise ValueError(f"A qualifier key cannot start with a number: {key!r}")

    qualifiers_map = dict(sorted(qualifiers_map.items()))

    if not encode:
        return qualifiers_map
    return _qualifier_map_to_string(qualifiers_map) or None


def _qualifier_map_to_string(qualifiers: dict[str, str]) -> str:
    """
    Return the `qualifiers` mapping as a "key=value" string joined with "&",
    in the mapping's iteration order.
    """
    qualifiers_list = [f"{key}={value}" for key, value in qualifiers.items()]
    return "&".join(qualifiers_list)


def normalize_subpath(subpath: AnyStr | None, encode: bool | None = True) -> str | None:
    """
    Return the normalized `subpath`, or None when it is empty.

    Blank segments and the "." and ".." segments are dropped; each remaining
    segment is quoted per `encode`.
    """
    if not subpath:
        return None

    subpath_str = subpath if isinstance(subpath, str) else subpath.decode("utf-8")
    quoter = get_quoter(encode)
    segments = subpath_str.split("/")
    segments = [quoter(s) for s in segments if s.strip() and s not in (".", "..")]
    subpath_str = "/".join(segments)
    return subpath_str or None


@overload
def normalize(
    type: AnyStr | None,
    namespace: AnyStr | None,
    name: AnyStr | None,
    version: AnyStr | None,
    qualifiers: AnyStr | dict[str, str] | None,
    subpath: AnyStr | None,
    encode: Literal[True] = ...,
) -> tuple[str, str | None, str, str | None, str | None, str | None]: ...


@overload
def normalize(
    type: AnyStr | None,
    namespace: AnyStr | None,
    name: AnyStr | None,
    version: AnyStr | None,
    qualifiers: AnyStr | dict[str, str] | None,
    subpath: AnyStr | None,
    encode: Literal[False] | None,
) -> tuple[str, str | None, str, str | None, dict[str, str], str | None]: ...


@overload
def normalize(
    type: AnyStr | None,
    namespace: AnyStr | None,
    name: AnyStr | None,
    version: AnyStr | None,
    qualifiers: AnyStr | dict[str, str] | None,
    subpath: AnyStr | None,
    encode: bool | None = ...,
) -> tuple[str, str | None, str, str | None, str | dict[str, str] | None, str | None]: ...
def normalize(
    type: AnyStr | None,
    namespace: AnyStr | None,
    name: AnyStr | None,
    version: AnyStr | None,
    qualifiers: AnyStr | dict[str, str] | None,
    subpath: AnyStr | None,
    encode: bool | None = True,
) -> tuple[
    str | None,
    str | None,
    str | None,
    str | None,
    str | dict[str, str] | None,
    str | None,
]:
    """
    Return normalized purl components

    The components are returned as a
    (type, namespace, name, version, qualifiers, subpath) tuple. `encode` is
    the tri-boolean passed to each component normalizer.
    """
    type_norm = normalize_type(type, encode)
    namespace_norm = normalize_namespace(namespace, type_norm, encode)
    name_norm = normalize_name(name, qualifiers, type_norm, encode)
    # Use the normalized type, as for namespace and name, so that the
    # type-specific version rules also apply to a bytes or mixed-case `type`.
    version_norm = normalize_version(version, type_norm, encode)
    qualifiers_norm = normalize_qualifiers(qualifiers, encode)
    subpath_norm = normalize_subpath(subpath, encode)
    return type_norm, namespace_norm, name_norm, version_norm, qualifiers_norm, subpath_norm


class PackageURL(
    namedtuple("PackageURL", ("type", "namespace", "name", "version", "qualifiers", "subpath"))
):
    """
    A purl is a package URL as defined at
    https://github.com/package-url/purl-spec
    """

    SCHEME: ClassVar[str] = "pkg"

    type: str
    namespace: str | None
    name: str
    version: str | None
    qualifiers: dict[str, str]
    subpath: str | None

    def __new__(
        cls,
        type: AnyStr | None = None,
        namespace: AnyStr | None = None,
        name: AnyStr | None = None,
        version: AnyStr | None = None,
        qualifiers: AnyStr | dict[str, str] | None = None,
        subpath: AnyStr | None = None,
        normalize_purl: bool = True,
    ) -> Self:
        """
        Return a new PackageURL built from components.

        `type` and `name` are required. When `normalize_purl` is True the
        components are normalized without percent-encoding or decoding;
        otherwise they are only converted to text.
        Raise ValueError on missing or mistyped arguments.
        """
        required = dict(type=type, name=name)
        for key, value in required.items():
            if not value:
                raise ValueError(f"Invalid purl: {key} is a required argument.")

        strings = dict(
            type=type,
            namespace=namespace,
            name=name,
            version=version,
            subpath=subpath,
        )

        for key, value in strings.items():
            # Empty values are accepted as-is; anything else must be a string.
            if not value or isinstance(value, basestring):
                continue
            raise ValueError(f"Invalid purl: {key} argument must be a string: {value!r}.")

        if qualifiers and not isinstance(qualifiers, (basestring, dict)):
            raise ValueError(
                f"Invalid purl: qualifiers argument must be a dict or a string: {qualifiers!r}."
            )

        type_final: str
        namespace_final: Optional[str]
        name_final: str
        version_final: Optional[str]
        qualifiers_final: dict[str, str]
        subpath_final: Optional[str]

        if normalize_purl:
            (
                type_final,
                namespace_final,
                name_final,
                version_final,
                qualifiers_final,
                subpath_final,
            ) = normalize(type, namespace, name, version, qualifiers, subpath, encode=None)
        else:
            from packageurl.utils import ensure_str

            type_final = ensure_str(type) or ""
            namespace_final = ensure_str(namespace)
            name_final = ensure_str(name) or ""
            version_final = ensure_str(version)
            # NOTE(review): string qualifiers are dropped on this path; only a
            # dict is kept. from_string() always passes a dict here.
            if isinstance(qualifiers, dict):
                qualifiers_final = qualifiers
            else:
                qualifiers_final = {}
            subpath_final = ensure_str(subpath)

        return super().__new__(
            cls,
            type=type_final,
            namespace=namespace_final,
            name=name_final,
            version=version_final,
            qualifiers=qualifiers_final,
            subpath=subpath_final,
        )

    def __str__(self, *args: Any, **kwargs: Any) -> str:
        """Return the purl string, as built by to_string()."""
        return self.to_string()

    def __hash__(self) -> int:
        """Return the hash of the purl string."""
        # The qualifiers field is a dict, so the default tuple hash cannot be
        # used.
        return hash(self.to_string())

    def to_dict(self, encode: bool | None = False, empty: Any = None) -> dict[str, Any]:
        """
        Return an ordered dict of purl components as {key: value}.
        If `encode` is True, then "qualifiers" are encoded as a normalized
        string. Otherwise, qualifiers is a mapping.
        You can provide a value for `empty` to be used in place of default None.
        """
        data = self._asdict()
        if encode:
            data["qualifiers"] = normalize_qualifiers(self.qualifiers, encode=encode)

        # Only values are replaced; the set of keys does not change while
        # iterating.
        for field, value in data.items():
            data[field] = value or empty

        return data

    def to_string(self, encode: bool | None = True) -> str:
        """
        Return a purl string built from components.

        The components are normalized with `encode` before being joined.
        """
        type, namespace, name, version, qualifiers, subpath = normalize(
            self.type,
            self.namespace,
            self.name,
            self.version,
            self.qualifiers,
            self.subpath,
            encode=encode,
        )

        purl = [self.SCHEME, ":", type, "/"]

        if namespace:
            purl.extend((namespace, "/"))

        purl.append(name)

        if version:
            purl.append("@")
            purl.append(version)

        if qualifiers:
            purl.append("?")
            # normalize() returns a mapping rather than a string when `encode`
            # is False or None.
            if isinstance(qualifiers, Mapping):
                qualifiers = _qualifier_map_to_string(qualifiers)
            purl.append(qualifiers)

        if subpath:
            purl.append("#")
            purl.append(subpath)

        return "".join(purl)

    def validate(self, strict: bool = False) -> list["ValidationMessage"]:
        """
        Validate this PackageURL object and return a list of validation error
        messages.

        A single ERROR message is returned when DEFINITIONS_BY_TYPE has no
        entry for this purl type.
        """
        from packageurl.validate import DEFINITIONS_BY_TYPE

        validator_class = DEFINITIONS_BY_TYPE.get(self.type)
        if not validator_class:
            return [
                ValidationMessage(
                    severity=ValidationSeverity.ERROR,
                    message=f"Unexpected purl type: expected {self.type!r}",
                )
            ]
        return list(validator_class.validate(purl=self, strict=strict))  # type: ignore[no-untyped-call]

    @classmethod
    def validate_string(cls, purl: str, strict: bool = False) -> list["ValidationMessage"]:
        """
        Validate a PURL string and return a list of validation error messages.

        The string is parsed without normalization when `strict` is True.
        A parsing failure is reported as a single ERROR message.
        """
        try:
            purl_obj = cls.from_string(purl, normalize_purl=not strict)
            assert isinstance(purl_obj, PackageURL)
            return purl_obj.validate(strict=strict)
        except ValueError as e:
            return [
                ValidationMessage(
                    severity=ValidationSeverity.ERROR,
                    message=str(e),
                )
            ]

    @classmethod
    def from_string(cls, purl: str, normalize_purl: bool = True) -> Self:
        """
        Return a PackageURL object parsed from a string.
        Raise ValueError on errors.

        When `normalize_purl` is False only the qualifiers are normalized
        (decoded to a mapping); the other components are kept as parsed.
        """
        if not purl or not isinstance(purl, str) or not purl.strip():
            raise ValueError("A purl string argument is required.")

        scheme, sep, remainder = purl.partition(":")
        if not sep or scheme != cls.SCHEME:
            raise ValueError(
                f'purl is missing the required "{cls.SCHEME}" scheme component: {purl!r}.'
            )

        # this strip '/, // and /// as possible in :// or :///
        remainder = remainder.strip().lstrip("/")

        version: str | None  # this line is just for type hinting
        subpath: str | None  # this line is just for type hinting

        type_, sep, remainder = remainder.partition("/")
        if not type_ or not sep:
            raise ValueError(f"purl is missing the required type component: {purl!r}.")

        valid_chars = string.ascii_letters + string.digits + ".-_"
        if not all(c in valid_chars for c in type_):
            raise ValueError(
                f"purl type must be composed only of ASCII letters and numbers, period, dash and underscore: {type_!r}."
            )

        if type_[0] in string.digits:
            raise ValueError(f"purl type cannot start with a number: {type_!r}.")

        type_ = type_.lower()

        original_remainder = remainder

        scheme, authority, path, qualifiers_str, subpath = _urlsplit(
            url=remainder, scheme="", allow_fragments=True
        )

        # The spec (seems) to allow colons in the name and namespace.
        # urllib.urlsplit splits on : considers them parts of scheme
        # and authority.
        # Other libraries do not care about this.
        # See https://github.com/package-url/packageurl-python/issues/152#issuecomment-2637692538
        # We do + ":" + to put the colon back that urlsplit removed.
        if authority:
            path = authority + ":" + path

        if scheme:
            # This is a way to preserve the casing of the original scheme
            original_scheme = original_remainder.split(":", 1)[0]
            path = original_scheme + ":" + path

        path = path.lstrip("/")

        namespace: str | None = ""
        # NPM purl have a namespace in the path
        # and the namespace in an npm purl is
        # different from others because it starts with `@`
        # so we need to handle this case separately
        if type_ == "npm" and path.startswith("@"):
            namespace, sep, path = path.partition("/")

        # The version is whatever follows the last "@"; without an "@" the
        # whole path is namespace/name and there is no version.
        remainder, sep, version = path.rpartition("@")
        if not sep:
            remainder = version
            version = None

        ns_name = remainder.strip().strip("/")
        ns_name_parts = ns_name.split("/")
        ns_name_parts = [seg for seg in ns_name_parts if seg and seg.strip()]
        name = ""
        if not namespace and len(ns_name_parts) > 1:
            # The last segment is the name; the leading ones are the namespace.
            name = ns_name_parts[-1]
            ns = ns_name_parts[:-1]
            namespace = "/".join(ns)
        elif len(ns_name_parts) == 1:
            name = ns_name_parts[0]

        if not name:
            raise ValueError(f"purl is missing the required name component: {purl!r}")

        if normalize_purl:
            type_, namespace, name, version, qualifiers, subpath = normalize(
                type_,
                namespace,
                name,
                version,
                qualifiers_str,
                subpath,
                encode=False,
            )
        else:
            qualifiers = normalize_qualifiers(qualifiers_str, encode=False) or {}
        return cls(
            type_, namespace, name, version, qualifiers, subpath, normalize_purl=normalize_purl
        )