Source code for qs_codec.utils.decode_utils

"""Utilities for decoding percent-encoded query strings and splitting composite keys into bracketed path segments.

This mirrors the semantics of the Node `qs` library:

- Decoding handles both UTF-8 and Latin-1 code paths.
- Key splitting keeps bracket groups *balanced* and optionally treats dots as path separators when ``allow_dots=True``.
- Top-level dot splitting uses a character-scanner that handles degenerate cases (leading '.' starts a bracket segment; '.[' is skipped; double dots preserve the first; trailing '.' is preserved) and never treats literal percent-encoded sequences (e.g., '%2E') as split points; only actual '.' characters at depth 0 are split.
"""

import re
import typing as t
from urllib.parse import unquote

from ..enums.charset import Charset
from ..enums.decode_kind import DecodeKind



[docs]
class DecodeUtils:
    """Decode helpers compiled into a single, importable namespace.

    All methods are classmethods so they are easy to stub/patch in tests, and
    the compiled regular expressions are class attributes, so each one is
    built once when the class body executes rather than on every call.
    """

    # Matches either a 16-bit JavaScript-style %uXXXX sequence (named group
    # ``unicode``) or a single-byte %XX sequence (named group ``hex``). The
    # %uXXXX alternative is listed first so it wins over %XX at the same
    # position. Used by `unescape` to emulate legacy browser behavior.
    # NOTE: ``re.IGNORECASE`` makes the explicit ``A-Fa-f`` ranges redundant
    # and also lets an uppercase ``%U`` prefix match the literal ``u``.
    UNESCAPE_PATTERN: t.Pattern[str] = re.compile(
        r"%u(?P<unicode>[0-9A-Fa-f]{4})|%(?P<hex>[0-9A-Fa-f]{2})",
        re.IGNORECASE,
    )


[docs]
    @classmethod
    def dot_to_bracket_top_level(cls, s: str) -> str:
        """Convert top-level dot segments into bracket groups *after* percent-decoding.

        Notes
        -----
        - In the normal decode path, the key has already been percent-decoded by the upstream
          scanner, so sequences like ``%2E``/``%2e`` are already literal ``.`` when this function
          runs. As a result, with ``allow_dots=True``, any top-level ``.`` will be treated as a
          separator here. This is independent of ``decode_dot_in_keys`` (which only affects how
          encoded dots *inside bracket segments* are normalized later during object folding).
        - If a custom decoder returns raw tokens (i.e., bypasses percent-decoding), ``%2E``/``%2e``
          may still appear here; those percent sequences are preserved verbatim and are **not**
          used as separators.

        Rules
        -----
        - Only dots at depth == 0 split. Dots inside ``[]`` are preserved.
        - Degenerate cases:
          * leading ``.`` starts a bracket segment (``.a`` behaves like ``[a]``)
          * ``.[`` is skipped so ``a.[b]`` behaves like ``a[b]``
          * ``a..b`` preserves the first dot → ``a.[b]``
          * trailing ``.`` is preserved and ignored by the splitter

        Examples
        --------
        'user.email.name' -> 'user[email][name]'
        'a[b].c' -> 'a[b][c]'
        'a[.].c' -> 'a[.][c]'
        'a%2E[b]' -> 'a%2E[b]' (only if a custom decoder left it encoded)
        """
        if "." not in s:
            return s
        sb: t.List[str] = []
        depth = 0
        i = 0
        n = len(s)
        while i < n:
            ch = s[i]
            if ch == "[":
                depth += 1
                sb.append(ch)
                i += 1
            elif ch == "]":
                if depth > 0:
                    depth -= 1
                sb.append(ch)
                i += 1
            elif ch == ".":
                if depth == 0:
                    has_next = i + 1 < n
                    next_ch = s[i + 1] if has_next else "\0"
                    if next_ch == "[":
                        # skip the dot so 'a.[b]' acts like 'a[b]'
                        i += 1
                    elif next_ch == "]":
                        # preserve ambiguous '.]' as a literal to avoid constructing '[]]'
                        sb.append(".")
                        i += 1
                    elif i == 0:
                        # If input starts with '..', preserve the first dot like the 'a..b' case.
                        if has_next and next_ch == ".":
                            sb.append(".")
                            i += 1
                            continue
                        # leading '.' starts a bracket segment: ".a" -> "[a]"
                        start = i + 1
                        j = start
                        while j < n and s[j] != "." and s[j] != "[" and s[j] != "]":
                            j += 1
                        sb.append("[")
                        sb.append(s[start:j])
                        sb.append("]")
                        i = j
                    elif (not has_next) or next_ch == ".":
                        # trailing dot, or first of a double dot
                        sb.append(".")
                        i += 1
                    else:
                        # normal split at top level: convert a.b → a[b]
                        start = i + 1
                        j = start
                        while j < n and s[j] != "." and s[j] != "[" and s[j] != "]":
                            j += 1
                        sb.append("[")
                        sb.append(s[start:j])
                        sb.append("]")
                        i = j
                else:
                    sb.append(".")
                    i += 1
            else:
                # No special handling for percent sequences here; characters are appended as-is.
                # We never split on '%2E' at this stage.
                sb.append(ch)
                i += 1
        return "".join(sb)


    # Precompiled pattern for a single %XX escape: a percent sign followed by
    # exactly two hex digits, which are captured as group 1. Used by `decode`
    # on the Latin-1 path, where each match is replaced by the character with
    # that code point.
    HEX2_PATTERN: t.Pattern[str] = re.compile(r"%([0-9A-Fa-f]{2})")


[docs]
    @classmethod
    def unescape(cls, string: str) -> str:
        """Emulate legacy JavaScript unescape behavior.

        Replaces both ``%XX`` and ``%uXXXX`` escape sequences with the
        corresponding code points. This function is intentionally permissive
        and does not validate UTF-8; it is used to model historical behavior
        in Latin-1 mode.

        Examples
        --------
        >>> DecodeUtils.unescape("%u0041%20%42")
        'A B'
        >>> DecodeUtils.unescape("%7E")
        '~'
        """
        # Fast path: nothing to unescape
        if "%" not in string:
            return string

        def replacer(match: t.Match[str]) -> str:
            if (unicode_val := match.group("unicode")) is not None:
                return chr(int(unicode_val, 16))
            elif (hex_val := match.group("hex")) is not None:
                return chr(int(hex_val, 16))
            return match.group(0)

        return cls.UNESCAPE_PATTERN.sub(replacer, string)



[docs]
    @classmethod
    def decode(
        cls,
        string: t.Optional[str],
        charset: t.Optional[Charset] = Charset.UTF8,
        kind: DecodeKind = DecodeKind.VALUE,  # pylint: disable=unused-argument
    ) -> t.Optional[str]:
        """Decode a single URL-encoded token.

        Steps, in order:

        1. ``None`` is passed straight through.
        2. Every ``+`` is turned into a space *before* any percent-decoding.
        3. If no ``%`` remains, the text is returned as-is.
        4. For :data:`~qs_codec.enums.charset.Charset.LATIN1`, each ``%XX``
           escape is replaced by the character with that code point;
           ``%uXXXX`` sequences are not expanded on this path.
        5. For any other ``charset`` value, the text is handed to
           :func:`urllib.parse.unquote`.

        Parameters
        ----------
        string : Optional[str]
            The raw token, or ``None``.
        charset : Optional[Charset]
            Selects the Latin-1 path when equal to ``Charset.LATIN1``.
        kind : DecodeKind
            Accepted for API compatibility but not read by this method; keys
            and values are decoded identically.

        Returns
        -------
        Optional[str]
            The decoded text, or ``None`` when the input is ``None``.
        """
        if string is None:
            return None

        text: str = string
        if "+" in text:
            text = text.replace("+", " ")

        # Without a percent sign neither code path has anything to decode.
        if "%" not in text:
            return text

        if charset == Charset.LATIN1:
            return cls.HEX2_PATTERN.sub(lambda found: chr(int(found.group(1), 16)), text)

        return unquote(text)



[docs]
    @classmethod
    def split_key_into_segments(
        cls,
        original_key: str,
        allow_dots: bool,
        max_depth: int,
        strict_depth: bool,
    ) -> t.List[str]:
        """Split a composite key into *balanced* bracket segments.

        - If ``allow_dots`` is True, convert **top-level** dots to bracket groups using a character-scanner (``a.b[c]`` → ``a[b][c]``), preserving dots inside brackets and degenerate cases.
        - The *parent* (non-bracket) prefix becomes the first segment, e.g. ``"a[b][c]"`` → ``["a", "[b]", "[c]"]``.
        - Bracket groups are *balanced* using a counter so nested brackets within a single group (e.g. ``"[with[inner]]"``) are treated as one segment.
        - When ``max_depth <= 0``, no splitting occurs; the key is returned as a single segment (qs semantics).
        - If there are more groups beyond ``max_depth`` and ``strict_depth`` is True, an ``IndexError`` is raised. Otherwise, the remainder is added as one final segment (again mirroring qs).
        - Unterminated '[': the remainder after the first unmatched '[' is captured as a single synthetic bracket segment.

        Examples
        --------
        max_depth=2: "a[b][c][d]" -> ["a", "[b]", "[c]", "[[d]]"]
        unterminated: "a[b" -> ["a", "[[b]"]

        This runs in O(n) time over the key string.
        """
        if max_depth <= 0:
            return [original_key]

        key: str = cls.dot_to_bracket_top_level(original_key) if allow_dots else original_key

        segments: t.List[str] = []

        first: int = key.find("[")
        parent: str = key[:first] if first >= 0 else key
        # Capture the non-bracket parent prefix (may be empty).
        if parent:
            segments.append(parent)

        n: int = len(key)
        open_idx: int = first
        depth: int = 0

        unterminated = False
        while open_idx >= 0 and depth < max_depth:
            level = 1
            i = open_idx + 1
            close = -1

            # Balance nested '[' and ']' inside the same group,
            # so "[withbracket[]]" is treated as *one* segment.
            while i < n:
                ch = key[i]
                if ch == "[":
                    level += 1
                elif ch == "]":
                    level -= 1
                    if level == 0:
                        close = i
                        break
                i += 1

            if close < 0:
                unterminated = True  # unterminated group; stop collecting; remainder handled below
                break

            # Append the full balanced group, including the surrounding brackets.
            segments.append(key[open_idx : close + 1])  # includes the surrounding [ ]
            depth += 1
            open_idx = key.find("[", close + 1)

        if open_idx >= 0:
            # We only want to raise for true depth overflow under strict_depth,
            # not for unterminated bracket groups.
            depth_overflow = (depth >= max_depth) and not unterminated
            if strict_depth and depth_overflow:
                raise IndexError(f"Input depth exceeded depth option of {max_depth} and strict_depth is True")
            # Stash the remainder as a single segment (qs parity)
            segments.append("[" + key[open_idx:] + "]")

        return segments
Source code for qs_codec.utils.decode_utils

qs-codec

Navigation

Related Topics