Source code for qs_codec.utils.encode_utils

"""A collection of encode utility methods used by the library."""

import re
import typing as t
from datetime import datetime
from decimal import Decimal
from enum import Enum

from ..enums.charset import Charset
from ..enums.format import Format


[docs] class EncodeUtils: """A collection of encode utility methods used by the library.""" HEX_TABLE: t.Tuple[str, ...] = tuple(f"%{i:02X}" for i in range(256)) """Hex table of all 256 characters""" SAFE_ALPHA: t.Set[int] = set(range(0x30, 0x3A)) | set(range(0x41, 0x5B)) | set(range(0x61, 0x7B)) """0-9, A-Z, a-z""" SAFE_POINTS: t.Set[int] = SAFE_ALPHA | {0x40, 0x2A, 0x5F, 0x2D, 0x2B, 0x2E, 0x2F} """0-9, A-Z, a-z, @, *, _, -, +, ., /""" RFC1738_SAFE_POINTS: t.Set[int] = SAFE_POINTS | {0x28, 0x29} """0-9, A-Z, a-z, @, *, _, -, +, ., /, (, )""" SAFE_CHARS: t.Set[int] = SAFE_ALPHA | {0x2D, 0x2E, 0x5F, 0x7E} """0-9, A-Z, a-z, -, ., _, ~""" RFC1738_SAFE_CHARS: t.Set[int] = SAFE_CHARS | {0x28, 0x29} """0-9, A-Z, a-z, -, ., _, ~, (, )""" SAFE_POINTS_ASCII: t.Tuple[bool, ...] = tuple(map(SAFE_POINTS.__contains__, range(0x80))) RFC1738_SAFE_POINTS_ASCII: t.Tuple[bool, ...] = tuple(map(RFC1738_SAFE_POINTS.__contains__, range(0x80))) SAFE_CHARS_ASCII: t.Tuple[bool, ...] = tuple(map(SAFE_CHARS.__contains__, range(0x80))) RFC1738_SAFE_CHARS_ASCII: t.Tuple[bool, ...] = tuple(map(RFC1738_SAFE_CHARS.__contains__, range(0x80))) _RE_UXXXX = re.compile(r"%u([0-9a-fA-F]{4})")
[docs] @classmethod def escape( cls, string: str, format: t.Optional[Format] = Format.RFC3986, ) -> str: """Emulate the legacy JavaScript escaping behavior. This function operates on UTF-16 *code units* to emulate JavaScript's legacy `%uXXXX` behavior. Non-BMP code points are first expanded into surrogate pairs via `_to_surrogates`, then each code unit is processed. - Safe set: when `format == Format.RFC1738`, the characters `(` and `)` are additionally treated as safe. Otherwise, the RFC3986 safe set is used. - ASCII characters in the safe set are emitted unchanged. - Code units &lt; 256 are emitted as `%XX`. - Other code units are emitted as `%uXXXX`. Reference: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/escape """ safe_points_ascii = cls.RFC1738_SAFE_POINTS_ASCII if format == Format.RFC1738 else cls.SAFE_POINTS_ASCII if string.isascii() and cls._all_ascii_safe(string, safe_points_ascii): return string if not string.isascii(): string = cls._to_surrogates(string) hex_table = cls.HEX_TABLE buffer: t.List[str] = [] i: int = 0 n: int = len(string) while i < n: c: int = ord(string[i]) # If we detect a high surrogate and there is a following low surrogate, encode both. if 0xD800 <= c <= 0xDBFF and (i + 1) < n: next_c: int = ord(string[i + 1]) if 0xDC00 <= next_c <= 0xDFFF: buffer.append(f"%u{c:04X}") buffer.append(f"%u{next_c:04X}") i += 2 continue if c < 0x80 and safe_points_ascii[c]: buffer.append(string[i]) elif c < 256: buffer.append(hex_table[c]) else: buffer.append(f"%u{c:04X}") i += 1 return "".join(buffer)
[docs] @classmethod def encode( cls, value: t.Any, charset: t.Optional[Charset] = Charset.UTF8, format: t.Optional[Format] = Format.RFC3986, ) -> str: """Encode a scalar value to a URL-encoded string. - Accepts numbers, `Decimal`, `Enum`, `str`, `bool`, and `bytes`. Any other type (including `None`) yields an empty string, matching the Node `qs` behavior. - For `Charset.LATIN1`, the output mirrors the JS `%uXXXX` + numeric entity trick so the result can be safely transported as latin-1. - Otherwise, values are encoded as UTF-8 using `_encode_string`. """ if value is None or not isinstance(value, (int, float, Decimal, Enum, str, bool, bytes)): return "" string: str = cls._convert_value_to_string(value) if not string: return "" if charset == Charset.LATIN1: _pat = cls._RE_UXXXX _esc = cls.escape(string, format) return _pat.sub(lambda m: f"%26%23{int(m.group(1), 16)}%3B", _esc) return cls._encode_string(string, format)
@staticmethod def _convert_value_to_string(value: t.Any) -> str: """Coerce a supported scalar to `str`. - `bytes` are decoded as UTF-8. - `bool` values are lower-cased (`"true"` / `"false"`). - `str` passes through. - All other supported scalars use `str(value)`. """ if isinstance(value, bytes): return value.decode("utf-8") elif isinstance(value, bool): return str(value).lower() elif isinstance(value, str): return value else: return str(value) @classmethod def _encode_string(cls, string: str, format: t.Optional[Format]) -> str: """Percent-encode `string` per RFC3986 or RFC1738, operating on UTF-16 code units. We first expand non-BMP code points into surrogate pairs so that indexing and length checks are done in *code units*, matching JavaScript semantics. We then walk the string with a manual index, skipping the low surrogate when we emit a surrogate pair. """ safe_chars_ascii = cls.RFC1738_SAFE_CHARS_ASCII if format == Format.RFC1738 else cls.SAFE_CHARS_ASCII if string.isascii(): if cls._all_ascii_safe(string, safe_chars_ascii): return string s = string else: s = cls._to_surrogates(string) hex_table = cls.HEX_TABLE buffer: t.List[str] = [] i = 0 n = len(s) while i < n: c = ord(s[i]) if c < 0x80 and safe_chars_ascii[c]: buffer.append(s[i]) i += 1 continue # ASCII if c < 0x80: buffer.append(hex_table[c]) i += 1 continue # Two-byte UTF-8 if c < 0x800: buffer.extend( [ hex_table[0xC0 | (c >> 6)], hex_table[0x80 | (c & 0x3F)], ] ) i += 1 continue # Surrogates → 4-byte UTF-8 (only when a valid high+low pair is present) if 0xD800 <= c <= 0xDBFF and (i + 1) < n: next_c = ord(s[i + 1]) if 0xDC00 <= next_c <= 0xDFFF: buffer.extend(cls._encode_surrogate_pair(s, i, c)) i += 2 continue # 3-byte UTF-8 (non-surrogate BMP) buffer.extend( [ hex_table[0xE0 | (c >> 12)], hex_table[0x80 | ((c >> 6) & 0x3F)], hex_table[0x80 | (c & 0x3F)], ] ) i += 1 return "".join(buffer) @classmethod def _is_safe_char(cls, c: int, format: t.Optional[Format]) -> bool: """Return True if code unit `c` is allowed unescaped for the given `format`.""" safe_chars_ascii = cls.RFC1738_SAFE_CHARS_ASCII if format == Format.RFC1738 else cls.SAFE_CHARS_ASCII return c < 0x80 and safe_chars_ascii[c] @classmethod def _encode_char(cls, string: str, i: int, c: int) -> t.List[str]: """Encode one UTF-16 code unit (at index `i`) into percent-encoded UTF-8 bytes. - ASCII (`c &lt; 0x80`) → single `%XX`. - Two-byte, three-byte UTF-8 forms as needed. - If `c` is a surrogate, defer to `_encode_surrogate_pair`. """ if c < 0x80: # ASCII return [cls.HEX_TABLE[c]] elif c < 0x800: # 2 bytes return [ cls.HEX_TABLE[0xC0 | (c >> 6)], cls.HEX_TABLE[0x80 | (c & 0x3F)], ] elif c < 0xD800 or c >= 0xE000: # 3 bytes return [ cls.HEX_TABLE[0xE0 | (c >> 12)], cls.HEX_TABLE[0x80 | ((c >> 6) & 0x3F)], cls.HEX_TABLE[0x80 | (c & 0x3F)], ] else: return cls._encode_surrogate_pair(string, i, c) @classmethod def _encode_surrogate_pair(cls, string: str, i: int, c: int) -> t.List[str]: """Encode a surrogate pair starting at `i` as a 4-byte UTF-8 sequence.""" buffer: t.List[str] = [] low = ord(string[i + 1]) c = 0x10000 + (((c & 0x3FF) << 10) | (low & 0x3FF)) buffer.extend( [ cls.HEX_TABLE[0xF0 | (c >> 18)], cls.HEX_TABLE[0x80 | ((c >> 12) & 0x3F)], cls.HEX_TABLE[0x80 | ((c >> 6) & 0x3F)], cls.HEX_TABLE[0x80 | (c & 0x3F)], ], ) return buffer @staticmethod def _to_surrogates(string: str) -> str: """Expand non-BMP code points (code point &gt; 0xFFFF) into UTF-16 surrogate pairs. This mirrors how JavaScript strings store characters, allowing compatibility with legacy `%uXXXX` encoding paths and consistent behavior with the Node `qs` implementation. If no non-BMP code point is present, the original string is returned unchanged. """ buffer: t.Optional[t.List[str]] = None for index, ch in enumerate(string): cp = ord(ch) if cp <= 0xFFFF: if buffer is not None: buffer.append(ch) continue if buffer is None: buffer = list(string[:index]) # Convert to surrogate pair. cp -= 0x10000 high = 0xD800 + (cp >> 10) low = 0xDC00 + (cp & 0x3FF) buffer.append(chr(high)) buffer.append(chr(low)) return string if buffer is None else "".join(buffer) @staticmethod def _all_ascii_safe(string: str, safe_table: t.Tuple[bool, ...]) -> bool: """Return True when every ASCII code unit in `string` is marked safe by `safe_table`. Callers must ensure `string.isascii()` is `True` before calling this helper. `safe_table` is indexed by `ord(ch)` and is sized only for ASCII code units. """ for ch in string: if not safe_table[ord(ch)]: return False return True
[docs] @staticmethod def serialize_date(dt: datetime) -> str: """Serialize a `datetime` to ISO-8601 using `datetime.isoformat()`.""" return dt.isoformat()