diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 593a343e1..013f7ea45 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -523,7 +523,7 @@ jobs: enable-cache: true # Download the Linux wheel built in the previous job. - # Docs only need the abi3 wheel — interpreter doesn't matter for sphinx. + # Docs only need the abi3 wheel — interpreter doesn't matter for mkdocs. - name: Download pre-built Linux wheel uses: actions/download-artifact@v8 with: @@ -549,12 +549,19 @@ jobs: fi - name: Build docs + env: + DISABLE_MKDOCS_2_WARNING: "true" run: | set -x - cd docs + # Stage notebook data files at docs_dir root so notebooks can + # resolve relative paths like "pokemon.csv" during execution. + cd docs/source curl -O https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet - uv run --no-project make html + cd ../.. + # Verify every datafusion.__all__ entry is documented. + uv run --no-project python dev/check_api_coverage.py + uv run --no-project mkdocs build - name: Copy & push the generated HTML if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref_type == 'tag') diff --git a/.gitignore b/.gitignore index 614d82327..198a0b041 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ target .idea /docs/temp /docs/build +/.cache .DS_Store .vscode diff --git a/AGENTS.md b/AGENTS.md index 632d6ebc0..fda08b23c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -84,9 +84,9 @@ Every Python function must include a docstring with usage examples. 
When adding or updating an aggregate or window function, ensure the corresponding site documentation is kept in sync: -- **Aggregations**: `docs/source/user-guide/common-operations/aggregations.rst` — +- **Aggregations**: `docs/source/user-guide/common-operations/aggregations.md` — add new aggregate functions to the "Aggregate Functions" list and include usage examples if appropriate. -- **Window functions**: `docs/source/user-guide/common-operations/windows.rst` — +- **Window functions**: `docs/source/user-guide/common-operations/windows.md` — add new window functions to the "Available Functions" list and include usage examples if appropriate. diff --git a/dev/check_api_coverage.py b/dev/check_api_coverage.py new file mode 100644 index 000000000..d18f659cf --- /dev/null +++ b/dev/check_api_coverage.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Check that every symbol in datafusion.__all__ is documented. + +Walks every Markdown file under docs/source/reference/ and collects: + +1. The dotted target of every ``::: `` mkdocstrings directive. +2. Every Markdown heading (``##``, ``###``, etc.). 
+ +A ``__all__`` entry is considered documented if its name appears as: + +- The leaf of a ``::: <...>`` directive, OR +- The leaf of a ``### name`` heading. + +Run from the repo root:: + + python dev/check_api_coverage.py +""" + +from __future__ import annotations + +import re +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[1] +REFERENCE_DIR = REPO_ROOT / "docs" / "source" / "reference" + + +def collect_documented_names() -> set[str]: + documented: set[str] = set() + directive_re = re.compile(r"^:::\s+([A-Za-z0-9_.]+)") + heading_re = re.compile(r"^#{1,6}\s+([A-Za-z0-9_]+)") + for md in REFERENCE_DIR.rglob("*.md"): + if md.stem != "index": + documented.add(md.stem) + for line in md.read_text().splitlines(): + m = directive_re.match(line.strip()) + if m: + dotted = m.group(1) + documented.add(dotted.split(".")[-1]) + documented.add(dotted) + continue + m = heading_re.match(line) + if m: + documented.add(m.group(1)) + return documented + + +def main() -> int: + sys.path.insert(0, str(REPO_ROOT / "python")) + import datafusion # noqa: PLC0415 + + documented = collect_documented_names() + missing = sorted(name for name in datafusion.__all__ if name not in documented) + if missing: + print("Undocumented entries in datafusion.__all__:") + for name in missing: + print(f" - {name}") + print(f"\n{len(missing)} symbol(s) missing from docs/source/reference/") + return 1 + print(f"All {len(datafusion.__all__)} __all__ entries are documented.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/dev/rewrite_doc_roles.py b/dev/rewrite_doc_roles.py new file mode 100644 index 000000000..25a40ee79 --- /dev/null +++ b/dev/rewrite_doc_roles.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Rewrite Sphinx / MyST cross-reference roles to Markdown links. + +Operates on: +- python/datafusion/*.py docstrings +- docs/source/**/*.md +- docs/source/**/*.ipynb (markdown cells) + +Conversions: + + :py:class:`~datafusion.x.Y` -> [`Y`][datafusion.x.Y] + :py:func:`~mod.fn` -> [`fn`][mod.fn] + :py:meth:`X.do ` -> [`X.do`][X.do] + {py:class}`~datafusion.x.Y` -> [`Y`][datafusion.x.Y] + {py:func}`mod.fn` -> [`mod.fn`][mod.fn] + {py:mod}`mod` -> [`mod`][mod] + {code}`text` -> `text` + {doc}`path/to/page` -> [path/to/page](path/to/page.md) + {doc}`Label ` -> [Label](path/to/page.md) + {ref}`anchor` -> [anchor](anchor) (best-effort) + {ref}`Label ` -> [Label](anchor) + (label)= (alone on a line) -> removed +""" + +from __future__ import annotations + +import json +import re +import sys +from pathlib import Path + +REPO = Path(__file__).resolve().parents[1] + +ROLE_PATTERNS = [ + # Sphinx RST roles: :py:class:`~mod.Name`, :class:`~mod.Name`, plus + # the `Name ` long form. Both `py:` and bare role names. 
+ ( + re.compile( + r":(?:py:)?(?:class|func|meth|mod|attr|obj|data|exc):`~?\.?([\w.]+)`" + ), + lambda m: f"[`{m.group(1).split('.')[-1]}`][{m.group(1)}]", + ), + ( + re.compile( + r":(?:py:)?(?:class|func|meth|mod|attr|obj|data|exc):`([^<`]+)\s*<\.?([\w.]+)>`" + ), + lambda m: f"[`{m.group(1).strip()}`][{m.group(2)}]", + ), + # MyST roles: {py:class}`~mod.Name` and the bare {class}`~mod.Name` aliases. + ( + re.compile( + r"\{(?:py:)?(?:class|func|meth|mod|attr|obj|data|exc)\}`~?\.?([\w.]+)`" + ), + lambda m: f"[`{m.group(1).split('.')[-1]}`][{m.group(1)}]", + ), + ( + re.compile( + r"\{(?:py:)?(?:class|func|meth|mod|attr|obj|data|exc)\}`([^<`]+)\s*<\.?([\w.]+)>`" + ), + lambda m: f"[`{m.group(1).strip()}`][{m.group(2)}]", + ), + # {code}`text`, {file}`path`, {samp}`text`, {kbd}`keys` -> `text` + ( + re.compile(r"\{(?:code|file|samp|kbd)\}`([^`]+)`"), + lambda m: f"`{m.group(1)}`", + ), + # {doc}`Label ` -> [Label](path.md) + ( + re.compile(r"\{doc\}`([^<`]+)\s*<([^>]+)>`"), + lambda m: f"[{m.group(1).strip()}]({m.group(2)}.md)", + ), + # {doc}`path` -> [path](path.md) + (re.compile(r"\{doc\}`([^`<]+)`"), lambda m: f"[{m.group(1)}]({m.group(1)}.md)"), + # {ref}`Label ` -> [Label](anchor) + ( + re.compile(r"\{ref\}`([^<`]+)\s*<([^>]+)>`"), + lambda m: f"[{m.group(1).strip()}]({m.group(2)})", + ), + # {ref}`anchor` -> [anchor](anchor) + (re.compile(r"\{ref\}`([^`<]+)`"), lambda m: f"[{m.group(1)}]({m.group(1)})"), +] + +# Drop standalone (label)= anchor lines (MyST cross-reference targets) +ANCHOR_LINE = re.compile(r"^\([a-zA-Z0-9_-]+\)=\s*$", re.MULTILINE) + + +def rewrite(text: str) -> str: + for pattern, repl in ROLE_PATTERNS: + text = pattern.sub(repl, text) + return ANCHOR_LINE.sub("", text) + + +def process_file(path: Path, *, dry_run: bool = False) -> int: + if path.suffix == ".ipynb": + original = path.read_text() + nb = json.loads(original) + changed = False + for cell in nb.get("cells", []): + if cell.get("cell_type") != "markdown": + continue + old = 
cell["source"] + text = "".join(old) if isinstance(old, list) else old + new = rewrite(text) + if new != text: + cell["source"] = new + changed = True + if changed and not dry_run: + path.write_text(json.dumps(nb, indent=1) + "\n") + return 1 if changed else 0 + + original = path.read_text() + new = rewrite(original) + if new != original: + if not dry_run: + path.write_text(new) + return 1 + return 0 + + +def main() -> int: + dry = "--dry-run" in sys.argv + paths = ( + list((REPO / "python" / "datafusion").rglob("*.py")) + + list((REPO / "docs" / "source").rglob("*.md")) + + list((REPO / "docs" / "source").rglob("*.ipynb")) + ) + changed = 0 + for p in paths: + changed += process_file(p, dry_run=dry) + print(f"changed: {changed} files" + (" (dry run)" if dry else "")) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/.gitignore b/docs/.gitignore index 6e8a53b6f..6f2465e60 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,7 @@ pokemon.csv yellow_trip_data.parquet yellow_tripdata_2021-01.parquet - +source/pokemon.csv +source/yellow_trip_data.parquet +source/yellow_tripdata_2021-01.parquet +build/ diff --git a/docs/Makefile b/docs/Makefile index 49ebae372..8ad0ae144 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,24 +15,24 @@ # specific language governing permissions and limitations # under the License. -# -# Minimal makefile for Sphinx documentation -# +# Thin wrapper. The mkdocs.yml lives at the repo root; run `mkdocs build` +# from one directory up. -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build +MKDOCS ?= mkdocs + +.PHONY: help html serve clean -# Put it first so that "make" without argument is like "make help". 
help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @echo "Targets:" + @echo " html - build site to docs/build/html" + @echo " serve - serve site at http://localhost:8000" + @echo " clean - remove docs/build/" + +html: + cd .. && DISABLE_MKDOCS_2_WARNING=true $(MKDOCS) build --strict -.PHONY: help Makefile +serve: + cd .. && DISABLE_MKDOCS_2_WARNING=true $(MKDOCS) serve -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --fail-on-warning \ No newline at end of file +clean: + rm -rf build/ diff --git a/docs/griffe_extensions.py b/docs/griffe_extensions.py new file mode 100644 index 000000000..aef19250e --- /dev/null +++ b/docs/griffe_extensions.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Griffe extensions for datafusion-python docs. + +`SphinxRefsToAutorefs` rewrites sphinx-style cross-reference roles +(``:func:`~path`, :class:`~path``, etc.) 
inside docstrings into +mkdocstrings autoref syntax (``[`tail`][path]``) so that the same +docstring renders as a clickable cross-reference both in JetBrains-style +IDEs (which understand sphinx roles) and on the published docs site +(which understands mkdocstrings autorefs). +""" + +from __future__ import annotations + +import re +from typing import Any + +from griffe import Extension, Object + +_ROLE_RE = re.compile( + r":(?:py:)?(?Pfunc|class|meth|attr|mod|obj|exc|const|data)" + r":`(?P~?)(?P[\w.]+)`" +) + + +def _rewrite(text: str) -> str: + def repl(match: re.Match[str]) -> str: + target = match.group("target") + tail = target.rsplit(".", 1)[-1] + return f"[`{tail}`][{target}]" + + return _ROLE_RE.sub(repl, text) + + +class SphinxRefsToAutorefs(Extension): + """Convert sphinx-style cross-references into mkdocstrings autorefs.""" + + def on_object(self, *, obj: Object, **_: Any) -> None: + docstring = obj.docstring + if docstring is None: + return + new = _rewrite(docstring.value) + if new != docstring.value: + docstring.value = new diff --git a/docs/hooks.py b/docs/hooks.py new file mode 100644 index 000000000..7a42542ad --- /dev/null +++ b/docs/hooks.py @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""MkDocs hooks for datafusion-python docs. + +Auto-injects a shared `markdown-exec` setup block at the top of every +user-guide page that contains executable Python code blocks. Page +authors write `.md` files with just prose plus `python exec="1" +session="..."` fences — they never have to copy/paste the shared +imports, `chdir`, or formatter configuration. + +The injected block uses the same session slug as the first executable +block on the page, so its imports and `chdir` carry over to the rest of +the page through markdown-exec's per-session globals. +""" + +from __future__ import annotations + +import re +from typing import Any + +# Matches `python exec="1" ... session=""` (slug captured). +# Tolerates attribute reordering. +_EXEC_FENCE = re.compile( + r'```python\s+[^\n]*?exec="1"[^\n]*?session="(?P[\w-]+)"', +) + +_SETUP_TEMPLATE = """```python exec="1" session="{slug}" +import os +import pathlib + +import datafusion # noqa: F401 +import datafusion.dataframe +from datafusion import ( # noqa: F401 + SessionContext, + col, + column, + lit, + literal, +) +from datafusion import functions as f # noqa: F401 +from datafusion.dataframe_formatter import configure_formatter + +# mkdocs build runs from the repo root; mkdocs serve from `docs/`. Walk +# the local candidates to find the demo data so pages resolve +# `pokemon.csv` regardless of which one is in use. +for _candidate in ("docs/source", "source", "."): + _p = pathlib.Path(_candidate) + if (_p / "pokemon.csv").exists(): + os.chdir(_p) + break + +configure_formatter(max_rows=10, show_truncation_message=False) + + +# `DataFrame.show()` writes through Rust's libc stdout (fd 1), bypassing +# Python's `sys.stdout` redirect that markdown-exec installs. Override +# it to route through Python's print() so the table appears in the +# captured output. The Python `__repr__` produces the same ASCII table. 
+def _show(self, *_args, **_kwargs): + print(self) + + +datafusion.dataframe.DataFrame.show = _show + + +# `DataFrame.__repr__` appends a literal "Data truncated." footer that +# the HTML-side `show_truncation_message=False` option does not affect. +# Strip it so the rendered docs do not advertise truncation on every +# example DataFrame. +_orig_repr = datafusion.dataframe.DataFrame.__repr__ + + +def _repr(self): + text = _orig_repr(self).rstrip() + if text.endswith("Data truncated."): + text = text[: -len("Data truncated.")].rstrip() + return text + + +datafusion.dataframe.DataFrame.__repr__ = _repr +``` +""" + + +def on_page_markdown(markdown: str, **_: Any) -> str: + """Prepend a setup `markdown-exec` block when the page uses code execution.""" + match = _EXEC_FENCE.search(markdown) + if match is None: + return markdown + return _SETUP_TEMPLATE.format(slug=match.group("slug")) + "\n" + markdown diff --git a/docs/source/_overrides/partials/copyright.html b/docs/source/_overrides/partials/copyright.html new file mode 100644 index 000000000..2b034896b --- /dev/null +++ b/docs/source/_overrides/partials/copyright.html @@ -0,0 +1,31 @@ +{#- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+-#} + diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index 661454b12..88602898a 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -17,49 +17,73 @@ * under the License. */ - -/* Customizing with theme CSS variables */ +/* Apache DataFusion accent color: rgb(215, 70, 51) == #D74633. + * + * Header bar is white in light mode / black in dark mode (Material palette + * primary: white|black). Red is applied only to links, code, and accents + * to match the pydata-sphinx-theme look used by datafusion-comet. + */ :root { - --pst-color-link-hover: 215, 70, 51; - --pst-color-headerlink: 215, 70, 51; - /* Softer blue from bootstrap's default info color */ - --pst-color-info: 23, 162, 184; + --md-accent-fg-color: #D74633; + --md-accent-fg-color--transparent: rgba(215, 70, 51, 0.1); + --md-typeset-a-color: #D74633; } -code { - color: rgb(215, 70, 51); +[data-md-color-scheme="slate"] { + --md-accent-fg-color: #FF8A75; + --md-typeset-a-color: #FF8A75; } -html[data-theme="dark"] code { - color: rgb(255, 138, 117); +/* Inline code styled with accent color */ +.md-typeset code { + color: #D74633; } -.footer { - text-align: center; +[data-md-color-scheme="slate"] .md-typeset code { + color: #FF8A75; } - -/* Bootstrap "table-striped" applied globally so individual tables in - user-guide pages don't need ":class: table-striped" added one by one. */ - -.table tbody tr:nth-of-type(odd) { - background-color: rgba(0, 0, 0, 0.05); +/* Hide notebook setup cells (chdir + boilerplate imports) so they don't + * appear as empty containers in the rendered output. The cells still + * execute — mkdocs-jupyter's TagRemovePreprocessor only strips input + * and outputs, leaving the wrapping div behind. */ +.celltag_nb-setup { + display: none !important; } -html[data-theme="dark"] .table tbody tr:nth-of-type(odd) { - background-color: rgba(255, 255, 255, 0.05); +/* markdown-exec emits an empty `
` for executable + * code blocks that produce no stdout. Hide them so pages don't show + * distracting empty boxes after assignment-only examples. */ +.result:empty { + display: none; } +/* Notebook code output (e.g. `df.show()` ASCII tables) often exceeds + * the content width. Force a non-wrapping pre with horizontal scroll + * so wide tables stay legible instead of wrapping mid-row. + * + * Material's `.md-typeset pre` rule sets `white-space: pre-wrap`, which + * would otherwise wrap each row across lines and destroy column + * alignment. */ +.jp-OutputArea-output, +.jp-OutputArea-output pre, +.jp-RenderedText, +.jp-RenderedText pre, +.md-typeset .jp-RenderedText pre, +.md-typeset .jp-OutputArea-output pre { + white-space: pre !important; + overflow-x: auto !important; + max-width: 100%; + word-wrap: normal; +} -/* Fix table text wrapping in RTD theme, - * see https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html - */ +/* Center the footer copyright/trademark block */ +.md-copyright { + text-align: center; +} -@media screen { - table.docutils td { - /* !important prevents the common CSS stylesheets from overriding - this as on RTD they are loaded after this stylesheet */ - white-space: normal !important; - } +.md-copyright__trademark { + margin-top: 0.4em; + opacity: 0.7; } diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html deleted file mode 100644 index d83d283c7..000000000 --- a/docs/source/_templates/layout.html +++ /dev/null @@ -1,22 +0,0 @@ -{% extends "pydata_sphinx_theme/layout.html" %} - - -{% block footer %} - -
-
- {% for footer_item in theme_footer_items %} - - {% endfor %} - -
-
- -{% endblock %} diff --git a/docs/source/_templates/sidebar-globaltoc.html b/docs/source/_templates/sidebar-globaltoc.html deleted file mode 100644 index f4aa2051f..000000000 --- a/docs/source/_templates/sidebar-globaltoc.html +++ /dev/null @@ -1,30 +0,0 @@ -{# Renders the global document toctree on every page (including the - landing page) with pydata-sphinx-theme's collapsible chevrons. - - The stock sidebar-nav-bs.html starts at the current section and is - stripped from the sidebar list by suppress_sidebar_toctree() on the - root page (no parent section). Using generate_toctree_html with - startdepth=0 renders the whole tree from root with the bootstrap - classes the theme's JS uses for expand/collapse toggles. Naming the - template "sidebar-globaltoc" sidesteps the suppress filter, which - matches on "sidebar-nav-bs.html" specifically. #} - diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index bb1473546..000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,175 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Documentation generation.""" - -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. 
For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - -# -- Project information ----------------------------------------------------- - -project = "Apache DataFusion in Python" -copyright = "2019-2026, Apache Software Foundation" -author = "Apache Software Foundation" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.mathjax", - "sphinx.ext.napoleon", - "myst_parser", - "IPython.sphinxext.ipython_directive", - "autoapi.extension", -] - -source_suffix = { - ".rst": "restructuredtext", - ".md": "markdown", -} - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = [] - -autoapi_dirs = ["../../python"] -autoapi_ignore = ["*tests*"] -autoapi_member_order = "groupwise" -suppress_warnings = ["autoapi.python_import_resolution"] -autoapi_python_class_content = "both" -autoapi_keep_files = False # set to True for debugging generated files - - -def autoapi_skip_member_fn(app, what, name, obj, skip, options) -> bool: # noqa: ARG001 - skip_contents = [ - # Re-exports - ("class", "datafusion.DataFrame"), - ("class", "datafusion.SessionContext"), - ("module", "datafusion.common"), - # Duplicate modules (skip module-level docs to avoid duplication) - ("module", "datafusion.col"), - ("module", "datafusion.udf"), - # Deprecated - ("class", "datafusion.substrait.serde"), - ("class", "datafusion.substrait.plan"), - ("class", "datafusion.substrait.producer"), - ("class", "datafusion.substrait.consumer"), - ("method", "datafusion.context.SessionContext.tables"), - ("method", "datafusion.dataframe.DataFrame.unnest_column"), - ] - # Explicitly skip certain members listed above. These are either - # re-exports, duplicate module-level documentation, deprecated - # API surfaces, or private variables that would otherwise appear - # in the generated docs and cause confusing duplication. - # Keeping this explicit list avoids surprising entries in the - # AutoAPI output and gives us a single place to opt-out items - # when we intentionally hide them from the docs. - if (what, name) in skip_contents: - skip = True - - return skip - - -def setup(sphinx) -> None: - sphinx.connect("autoapi-skip-member", autoapi_skip_member_fn) - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-# -html_theme = "pydata_sphinx_theme" - -html_theme_options = { - "use_edit_page_button": False, - "show_toc_level": 2, - "logo": { - "image_light": "_static/images/original.svg", - "image_dark": "_static/images/original.svg", - "alt_text": "Apache DataFusion in Python", - }, - "navbar_start": ["navbar-logo"], - "navbar_center": ["navbar-nav"], - "navbar_end": ["navbar-icon-links", "theme-switcher"], - "icon_links": [ - { - "name": "GitHub", - "url": "https://github.com/apache/datafusion-python", - "icon": "fa-brands fa-github", - }, - { - "name": "Rust API docs (docs.rs)", - "url": "https://docs.rs/datafusion/latest/datafusion/", - "icon": "fa-brands fa-rust", - }, - ], - "secondary_sidebar_items": [], - "collapse_navigation": True, - "show_nav_level": 2, -} - -html_context = { - "github_user": "apache", - "github_repo": "datafusion-python", - "github_version": "main", - "doc_path": "docs/source", - "default_mode": "auto", -} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -html_favicon = "_static/favicon.svg" - -# Copy agent-facing files (llms.txt) verbatim to the site root so they -# resolve at conventional URLs like `https://.../python/llms.txt`. 
-html_extra_path = ["llms.txt"] - -html_css_files = ["theme_overrides.css"] - -html_sidebars = { - "**": ["sidebar-globaltoc.html"], -} - -# tell myst_parser to auto-generate anchor links for headers h1, h2, h3 -myst_heading_anchors = 3 - -# enable nice rendering of checkboxes for the task lists -myst_enable_extensions = ["tasklist"] diff --git a/docs/source/contributor-guide/ffi.md b/docs/source/contributor-guide/ffi.md new file mode 100644 index 000000000..ea3ca5b29 --- /dev/null +++ b/docs/source/contributor-guide/ffi.md @@ -0,0 +1,271 @@ + + + +# Python Extensions + +The DataFusion in Python project is designed to allow users to extend its functionality in a few core +areas. Ideally many users would like to package their extensions as a Python package and easily +integrate that package with this project. This page serves to describe some of the challenges we face +when doing these integrations and the approach our project uses. + +## The Primary Issue + +Suppose you wish to use DataFusion and you have a custom data source that can produce tables that +can then be queried against, similar to how you can register a [CSV](../user-guide/io/csv.md) or +[Parquet](../user-guide/io/parquet.md) file. In DataFusion terminology, you likely want to implement a +[Custom Table Provider](../user-guide/io/table_provider.md). In an effort to make your data source +as performant as possible and to utilize the features of DataFusion, you may decide to write +your source in Rust and then expose it through [PyO3](https://pyo3.rs) as a Python library. + +At first glance, it may appear the best way to do this is to add the `datafusion-python` +crate as a dependency, provide a `PyTable`, and then to register it with the +`SessionContext`. Unfortunately, this will not work. + +When you produce your code as a Python library and it needs to interact with the DataFusion +library, at the lowest level they communicate through an Application Binary Interface (ABI). 
+The acronym sounds similar to API (Application Programming Interface), but it is distinctly +different. + +The ABI sets the standard for how these libraries can share data and functions between each +other. One of the key differences between Rust and other programming languages is that Rust +does not have a stable ABI. What this means in practice is that if you compile a Rust library +with one version of the `rustc` compiler and I compile another library to interface with it +but I use a different version of the compiler, there is no guarantee the interface will be +the same. + +In practice, this means that a Python library built with `datafusion-python` as a Rust +dependency will generally **not** be compatible with the DataFusion Python package, even +if they reference the same version of `datafusion-python`. If you attempt to do this, it may +work on your local computer if you have built both packages with the same optimizations. +This can sometimes lead to a false expectation that the code will work, but it frequently +breaks the moment you try to use your package against the released packages. + +You can find more information about the Rust ABI in their +[online documentation](https://doc.rust-lang.org/reference/abi.html). + +## The FFI Approach + +Rust supports interacting with other programming languages through its Foreign Function +Interface (FFI). The advantage of using the FFI is that it enables you to write data structures +and functions that have a stable ABI. This allows you to use Rust code with C, Python, and +other languages. In fact, the [PyO3](https://pyo3.rs) library uses the FFI to share data +and functions between Python and Rust. + +The approach we are taking in the DataFusion in Python project is to incrementally expose +more portions of the DataFusion project via FFI interfaces. 
This allows users to write Rust +code that does **not** require the `datafusion-python` crate as a dependency, expose their +code in Python via PyO3, and have it interact with the DataFusion Python package. + +Early adopters of this approach include [delta-rs](https://delta-io.github.io/delta-rs/) +who has adapted their Table Provider for use in `datafusion-python` with only a few lines +of code. Also, the DataFusion Python project uses the existing definitions from +[Apache Arrow CStream Interface](https://arrow.apache.org/docs/format/CStreamInterface.html) +to support importing **and** exporting tables. Any Python package that supports reading +the Arrow C Stream interface can work with DataFusion Python out of the box! You can read +more about working with Arrow sources in the [Data Sources](../user-guide/data-sources.md) +page. + +To learn more about the Foreign Function Interface in Rust, the +[Rustonomicon](https://doc.rust-lang.org/nomicon/ffi.html) is a good resource. + +## Inspiration from Arrow + +DataFusion is built upon [Apache Arrow](https://arrow.apache.org/). The canonical Python +Arrow implementation, [pyarrow](https://arrow.apache.org/docs/python/index.html) provides +an excellent way to share Arrow data between Python projects without performing any copy +operations on the data. They do this by using a well defined set of interfaces. You can +find the details about their stream interface +[here](https://arrow.apache.org/docs/format/CStreamInterface.html). The +[Rust Arrow Implementation](https://github.com/apache/arrow-rs) also supports these +`C` style definitions via the Foreign Function Interface. + +In addition to using these interfaces to transfer Arrow data between libraries, `pyarrow` +goes one step further to make sharing the interfaces easier in Python. They do this +by exposing PyCapsules that contain the expected functionality. 
+ +You can learn more about PyCapsules from the official +[Python online documentation](https://docs.python.org/3/c-api/capsule.html). PyCapsules +have excellent support in PyO3 already. The +[PyO3 online documentation](https://pyo3.rs/main/doc/pyo3/types/struct.pycapsule) is a good source +for more details on using PyCapsules in Rust. + +Two lessons we leverage from the Arrow project in DataFusion Python are: + +- We reuse the existing Arrow FFI functionality wherever possible. +- We expose PyCapsules that contain an FFI-stable struct. + +## Implementation Details + +The bulk of the code necessary to perform our FFI operations is in the upstream +[DataFusion](https://datafusion.apache.org/) core repository. You can review the code and +documentation in the [datafusion-ffi] crate. + +Our FFI implementation is narrowly focused on sharing data and functions with Rust-backed +libraries. Starting in DataFusion 54.0.0 we use the +[stabby crate](https://crates.io/crates/stabby) (previously the +[abi_stable crate](https://crates.io/crates/abi_stable)). `stabby` provides +FFI-safe equivalents of common Rust types with a thinner runtime cost +and stricter ABI stability guarantees. For example, passing a +`Vec<String>` across the FFI boundary is done via stabby's `Vec` / +`String` wrappers, and the crate also supplies FFI-safe analogues of +`Result` and `Option` that do not have an obvious translation to a +C equivalent. + +The [datafusion-ffi] crate has been designed to make it easy to convert from DataFusion +traits into their FFI counterparts.
For example, if you have defined a custom +[TableProvider](https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html) +and you want to expose it through a PyCapsule, you can pull the logical +codec out of the calling session and hand both to `FFI_TableProvider`: + +```rust +use datafusion_ffi::table_provider::FFI_TableProvider; +use datafusion_python_util::ffi_logical_codec_from_pycapsule; + +let my_provider = MyTableProvider::default(); +let codec = ffi_logical_codec_from_pycapsule(session)?; +let ffi_provider = + FFI_TableProvider::new_with_ffi_codec(Arc::new(my_provider), false, None, codec); +``` + +See [`examples/datafusion-ffi-example/src/table_provider.rs`](https://github.com/apache/datafusion-python/blob/main/examples/datafusion-ffi-example/src/table_provider.rs) +for a complete runnable example. + + +## PyO3 class mutability guidelines + +PyO3 bindings should present immutable wrappers whenever a struct stores shared or +interior-mutable state. In practice this means that any `#[pyclass]` containing an +`Arc<RwLock<_>>` or similar synchronized primitive must opt into `#[pyclass(frozen)]` +unless there is a compelling reason not to. + +The execution context illustrates the preferred pattern. `PySessionContext` in +`src/context.rs` stays frozen even though it shares mutable state internally via +`SessionContext`. This ensures PyO3 tracks borrows correctly while Python-facing APIs +clone the inner `SessionContext` or return new wrappers instead of mutating the +existing instance in place: + +```rust +#[pyclass(from_py_object, frozen, name = "SessionContext", module = "datafusion", subclass)] +#[derive(Clone)] +pub struct PySessionContext { + pub ctx: SessionContext, +} +``` + +Occasionally a type must remain mutable—for example when PyO3 attribute setters need to +update fields directly. In these rare cases add an inline justification so reviewers and +future contributors understand why `frozen` is unsafe to enable.
`DataTypeMap` in +`src/common/data_type.rs` includes such a comment because PyO3 still needs to track +field updates: + +```rust +// TODO: This looks like this needs pyo3 tracking so leaving unfrozen for now +#[derive(Debug, Clone)] +#[pyclass(from_py_object, name = "DataTypeMap", module = "datafusion.common", subclass)] +pub struct DataTypeMap { + #[pyo3(get, set)] + pub arrow_type: PyDataType, + #[pyo3(get, set)] + pub python_type: PythonType, + #[pyo3(get, set)] + pub sql_type: SqlType, +} +``` + +When reviewers encounter a mutable `#[pyclass]` without a comment, they should request +an explanation or ask that `frozen` be added. Keeping these wrappers frozen by default +helps avoid subtle bugs stemming from PyO3's interior mutability tracking. + +If you are interfacing with a library that provided the above `FFI_TableProvider` and +need a usable `TableProvider`, the `.into()` conversion now yields an +`Arc<dyn TableProvider>` directly: + +```rust +let provider: Arc<dyn TableProvider> = ffi_provider.into(); +``` + +(Older revisions of `datafusion-ffi` produced a `ForeignTableProvider` wrapper as an +intermediate; that step is no longer needed.) + +If you review the code in [datafusion-ffi] you will find that each of the traits we share +across the boundary has a struct prefixed with `FFI_`. This is the struct that lives on +the **provider** side of the FFI boundary — the code that has written the underlying +`TableProvider` implementation to access your custom data source. The receiver +(`datafusion-python`, in our case) consumes the `FFI_` struct through the FFI trait +implementations supplied by `datafusion-ffi`. + +In order to share these FFI structures, we need to wrap them in some kind of Python object +that can be used to interface from one package to another. As described in the above +section on our inspiration from Arrow, we use `PyCapsule`.
We can create a `PyCapsule` +for our provider thusly: + +```rust +let name = cr"datafusion_table_provider".into(); +let my_capsule = PyCapsule::new(py, provider, Some(name))?; +``` + +On the receiving side, turn this pycapsule object into the `FFI_TableProvider`, then +convert directly to an `Arc<dyn TableProvider>`: + +```rust +let capsule = capsule.cast::<PyCapsule>()?; +let data: NonNull<FFI_TableProvider> = capsule + .pointer_checked(Some(c"datafusion_table_provider"))? + .cast(); +let ffi_provider = unsafe { data.as_ref() }; +let provider: Arc<dyn TableProvider> = ffi_provider.into(); +``` + +By convention the `datafusion-python` library expects a Python object that has a +`TableProvider` PyCapsule to have this capsule accessible by calling a function named +`__datafusion_table_provider__`. You can see a complete working example of how to +share a `TableProvider` from one python library to DataFusion Python in the +[repository examples folder](https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example). + +This section has been written using `TableProvider` as an example. It is the first +extension that has been written using this approach and the most thoroughly implemented. +As we continue to expose more of the DataFusion features, we intend to follow this same +design pattern. + +## Alternative Approach + +Suppose you needed to expose some other features of DataFusion and you could not wait +for the upstream repository to implement the FFI approach we describe. In this case +you decide to create your dependency on the `datafusion-python` crate instead. + +As we discussed, this is not guaranteed to work across different compiler versions and +optimization levels. If you wish to go down this route, there are two approaches we +have identified you can use. + +1. Re-export all of `datafusion-python` yourself with your extensions built in. +2.
Carefully synchronize your software releases with the `datafusion-python` CI build + system so that your libraries use the exact same compiler, features, and + optimization level. + +We currently do not recommend either of these approaches as they are difficult to +maintain over a long period. Additionally, they require a tight version coupling +between libraries. + +## Status of Work + +At the time of this writing, the FFI features are under active development. To see +the latest status, we recommend reviewing the code in the [datafusion-ffi] crate. + +[datafusion-ffi]: https://crates.io/crates/datafusion-ffi diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst deleted file mode 100644 index c89b99849..000000000 --- a/docs/source/contributor-guide/ffi.rst +++ /dev/null @@ -1,265 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _ffi: - -Python Extensions -================= - -The DataFusion in Python project is designed to allow users to extend its functionality in a few core -areas. Ideally many users would like to package their extensions as a Python package and easily -integrate that package with this project. 
This page serves to describe some of the challenges we face -when doing these integrations and the approach our project uses. - -The Primary Issue ------------------ - -Suppose you wish to use DataFusion and you have a custom data source that can produce tables that -can then be queried against, similar to how you can register a :ref:`CSV ` or -:ref:`Parquet ` file. In DataFusion terminology, you likely want to implement a -:ref:`Custom Table Provider `. In an effort to make your data source -as performant as possible and to utilize the features of DataFusion, you may decide to write -your source in Rust and then expose it through `PyO3 `_ as a Python library. - -At first glance, it may appear the best way to do this is to add the ``datafusion-python`` -crate as a dependency, provide a ``PyTable``, and then to register it with the -``SessionContext``. Unfortunately, this will not work. - -When you produce your code as a Python library and it needs to interact with the DataFusion -library, at the lowest level they communicate through an Application Binary Interface (ABI). -The acronym sounds similar to API (Application Programming Interface), but it is distinctly -different. - -The ABI sets the standard for how these libraries can share data and functions between each -other. One of the key differences between Rust and other programming languages is that Rust -does not have a stable ABI. What this means in practice is that if you compile a Rust library -with one version of the ``rustc`` compiler and I compile another library to interface with it -but I use a different version of the compiler, there is no guarantee the interface will be -the same. - -In practice, this means that a Python library built with ``datafusion-python`` as a Rust -dependency will generally **not** be compatible with the DataFusion Python package, even -if they reference the same version of ``datafusion-python``. 
If you attempt to do this, it may -work on your local computer if you have built both packages with the same optimizations. -This can sometimes lead to a false expectation that the code will work, but it frequently -breaks the moment you try to use your package against the released packages. - -You can find more information about the Rust ABI in their -`online documentation `_. - -The FFI Approach ----------------- - -Rust supports interacting with other programming languages through it's Foreign Function -Interface (FFI). The advantage of using the FFI is that it enables you to write data structures -and functions that have a stable ABI. The allows you to use Rust code with C, Python, and -other languages. In fact, the `PyO3 `_ library uses the FFI to share data -and functions between Python and Rust. - -The approach we are taking in the DataFusion in Python project is to incrementally expose -more portions of the DataFusion project via FFI interfaces. This allows users to write Rust -code that does **not** require the ``datafusion-python`` crate as a dependency, expose their -code in Python via PyO3, and have it interact with the DataFusion Python package. - -Early adopters of this approach include `delta-rs `_ -who has adapted their Table Provider for use in ```datafusion-python``` with only a few lines -of code. Also, the DataFusion Python project uses the existing definitions from -`Apache Arrow CStream Interface `_ -to support importing **and** exporting tables. Any Python package that supports reading -the Arrow C Stream interface can work with DataFusion Python out of the box! You can read -more about working with Arrow sources in the :ref:`Data Sources ` -page. - -To learn more about the Foreign Function Interface in Rust, the -`Rustonomicon `_ is a good resource. - -Inspiration from Arrow ----------------------- - -DataFusion is built upon `Apache Arrow `_. 
The canonical Python -Arrow implementation, `pyarrow `_ provides -an excellent way to share Arrow data between Python projects without performing any copy -operations on the data. They do this by using a well defined set of interfaces. You can -find the details about their stream interface -`here `_. The -`Rust Arrow Implementation `_ also supports these -``C`` style definitions via the Foreign Function Interface. - -In addition to using these interfaces to transfer Arrow data between libraries, ``pyarrow`` -goes one step further to make sharing the interfaces easier in Python. They do this -by exposing PyCapsules that contain the expected functionality. - -You can learn more about PyCapsules from the official -`Python online documentation `_. PyCapsules -have excellent support in PyO3 already. The -`PyO3 online documentation `_ is a good source -for more details on using PyCapsules in Rust. - -Two lessons we leverage from the Arrow project in DataFusion Python are: - -- We reuse the existing Arrow FFI functionality wherever possible. -- We expose PyCapsules that contain a FFI stable struct. - -Implementation Details ----------------------- - -The bulk of the code necessary to perform our FFI operations is in the upstream -`DataFusion `_ core repository. You can review the code and -documentation in the `datafusion-ffi`_ crate. - -Our FFI implementation is narrowly focused at sharing data and functions with Rust backed -libraries. This allows us to use the `abi_stable crate `_. -This is an excellent crate that allows for easy conversion between Rust native types -and FFI-safe alternatives. For example, if you needed to pass a ``Vec`` via FFI, -you can simply convert it to a ``RVec`` in an intuitive manner. It also supports -features like ``RResult`` and ``ROption`` that do not have an obvious translation to a -C equivalent. - -The `datafusion-ffi`_ crate has been designed to make it easy to convert from DataFusion -traits into their FFI counterparts. 
For example, if you have defined a custom -`TableProvider `_ -and you want to create a sharable FFI counterpart, you could write: - -.. code-block:: rust - - let my_provider = MyTableProvider::default(); - let ffi_provider = FFI_TableProvider::new(Arc::new(my_provider), false, None); - -.. _ffi_pyclass_mutability: - -PyO3 class mutability guidelines --------------------------------- - -PyO3 bindings should present immutable wrappers whenever a struct stores shared or -interior-mutable state. In practice this means that any ``#[pyclass]`` containing an -``Arc>`` or similar synchronized primitive must opt into ``#[pyclass(frozen)]`` -unless there is a compelling reason not to. - -The execution context illustrates the preferred pattern. ``PySessionContext`` in -:file:`src/context.rs` stays frozen even though it shares mutable state internally via -``SessionContext``. This ensures PyO3 tracks borrows correctly while Python-facing APIs -clone the inner ``SessionContext`` or return new wrappers instead of mutating the -existing instance in place: - -.. code-block:: rust - - #[pyclass(from_py_object, frozen, name = "SessionContext", module = "datafusion", subclass)] - #[derive(Clone)] - pub struct PySessionContext { - pub ctx: SessionContext, - } - -Occasionally a type must remain mutable—for example when PyO3 attribute setters need to -update fields directly. In these rare cases add an inline justification so reviewers and -future contributors understand why ``frozen`` is unsafe to enable. ``DataTypeMap`` in -:file:`src/common/data_type.rs` includes such a comment because PyO3 still needs to track -field updates: - -.. 
code-block:: rust - - // TODO: This looks like this needs pyo3 tracking so leaving unfrozen for now - #[derive(Debug, Clone)] - #[pyclass(from_py_object, name = "DataTypeMap", module = "datafusion.common", subclass)] - pub struct DataTypeMap { - #[pyo3(get, set)] - pub arrow_type: PyDataType, - #[pyo3(get, set)] - pub python_type: PythonType, - #[pyo3(get, set)] - pub sql_type: SqlType, - } - -When reviewers encounter a mutable ``#[pyclass]`` without a comment, they should request -an explanation or ask that ``frozen`` be added. Keeping these wrappers frozen by default -helps avoid subtle bugs stemming from PyO3's interior mutability tracking. - -If you were interfacing with a library that provided the above ``FFI_TableProvider`` and -you needed to turn it back into an ``TableProvider``, you can turn it into a -``ForeignTableProvider`` with implements the ``TableProvider`` trait. - -.. code-block:: rust - - let foreign_provider: ForeignTableProvider = ffi_provider.into(); - -If you review the code in `datafusion-ffi`_ you will find that each of the traits we share -across the boundary has two portions, one with a ``FFI_`` prefix and one with a ``Foreign`` -prefix. This is used to distinguish which side of the FFI boundary that struct is -designed to be used on. The structures with the ``FFI_`` prefix are to be used on the -**provider** of the structure. In the example we're showing, this means the code that has -written the underlying ``TableProvider`` implementation to access your custom data source. -The structures with the ``Foreign`` prefix are to be used by the receiver. In this case, -it is the ``datafusion-python`` library. - -In order to share these FFI structures, we need to wrap them in some kind of Python object -that can be used to interface from one package to another. As described in the above -section on our inspiration from Arrow, we use ``PyCapsule``. We can create a ``PyCapsule`` -for our provider thusly: - -.. 
code-block:: rust - - let name = CString::new("datafusion_table_provider")?; - let my_capsule = PyCapsule::new_bound(py, provider, Some(name))?; - -On the receiving side, turn this pycapsule object into the ``FFI_TableProvider``, which -can then be turned into a ``ForeignTableProvider`` the associated code is: - -.. code-block:: rust - - let capsule = capsule.cast::()?; - let data: NonNull = capsule - .pointer_checked(Some(name))? - .cast(); - let codec = unsafe { data.as_ref() }; - -By convention the ``datafusion-python`` library expects a Python object that has a -``TableProvider`` PyCapsule to have this capsule accessible by calling a function named -``__datafusion_table_provider__``. You can see a complete working example of how to -share a ``TableProvider`` from one python library to DataFusion Python in the -`repository examples folder `_. - -This section has been written using ``TableProvider`` as an example. It is the first -extension that has been written using this approach and the most thoroughly implemented. -As we continue to expose more of the DataFusion features, we intend to follow this same -design pattern. - -Alternative Approach --------------------- - -Suppose you needed to expose some other features of DataFusion and you could not wait -for the upstream repository to implement the FFI approach we describe. In this case -you decide to create your dependency on the ``datafusion-python`` crate instead. - -As we discussed, this is not guaranteed to work across different compiler versions and -optimization levels. If you wish to go down this route, there are two approaches we -have identified you can use. - -#. Re-export all of ``datafusion-python`` yourself with your extensions built in. -#. Carefully synchronize your software releases with the ``datafusion-python`` CI build - system so that your libraries use the exact same compiler, features, and - optimization level. 
- -We currently do not recommend either of these approaches as they are difficult to -maintain over a long period. Additionally, they require a tight version coupling -between libraries. - -Status of Work --------------- - -At the time of this writing, the FFI features are under active development. To see -the latest status, we recommend reviewing the code in the `datafusion-ffi`_ crate. - -.. _datafusion-ffi: https://crates.io/crates/datafusion-ffi diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md new file mode 100644 index 000000000..6fcd86017 --- /dev/null +++ b/docs/source/contributor-guide/index.md @@ -0,0 +1,30 @@ + + +# Contributor Guide + +Guides for contributors to the DataFusion in Python project. + +## Contents + +- [Introduction](introduction.md) — workflow, code layout, how to run + the test suite, how PRs are reviewed. +- [FFI](ffi.md) — exposing Rust-backed extensions through the Foreign + Function Interface so they appear as first-class DataFusion symbols + in Python. diff --git a/docs/source/contributor-guide/index.rst b/docs/source/contributor-guide/index.rst deleted file mode 100644 index b32e08878..000000000 --- a/docs/source/contributor-guide/index.rst +++ /dev/null @@ -1,28 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. 
specific language governing permissions and limitations -.. under the License. - -================= -Contributor Guide -================= - -Guides for contributors to the DataFusion in Python project. - -.. toctree:: - :maxdepth: 2 - - introduction - ffi diff --git a/docs/source/contributor-guide/introduction.md b/docs/source/contributor-guide/introduction.md new file mode 100644 index 000000000..44bae72ed --- /dev/null +++ b/docs/source/contributor-guide/introduction.md @@ -0,0 +1,149 @@ + + +# Introduction + +We welcome and encourage contributions of all kinds, such as: + +1. Tickets with issue reports or feature requests +2. Documentation improvements +3. Code, both PR and (especially) PR Review. + +In addition to submitting new PRs, we have a healthy tradition of community members reviewing each other’s PRs. +Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases. + +Before opening a pull request that touches PyO3 bindings, please review the +[PyO3 class mutability guidelines](ffi.md#pyo3-class-mutability-guidelines) so you can flag missing +`#[pyclass(frozen)]` annotations during development and review. + +## How to develop + +This assumes that you have rust and cargo installed. We use the workflow recommended by +[pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). We recommend using +[uv](https://docs.astral.sh/uv/) for python package management. + +By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually.
This means +that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion` +and for `uv run` commands the additional parameter `--no-project` + +Bootstrap: + +```shell +# fetch this repo +git clone git@github.com:apache/datafusion-python.git +# create the virtual environment +uv sync --dev --no-install-package datafusion +# activate the environment +source .venv/bin/activate +``` + +The tests rely on test data in git submodules. + +```shell +git submodule init +git submodule update +``` + +Whenever rust code changes (your changes or via `git pull`): + +```shell +# make sure you activate the venv using "source .venv/bin/activate" first +maturin develop --uv +python -m pytest +``` + +## Running & Installing pre-commit hooks + +arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise. + +Our pre-commit hooks can be installed by running `pre-commit install`, which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing. + +The pre-commit hooks can also be run adhoc without installing them by simply running `pre-commit run --all-files` + +## Guidelines for Separating Python and Rust Code + +Version 40 of `datafusion-python` introduced `python` wrappers around the `pyo3` generated code to vastly improve the user experience. (See the [blog post](https://datafusion.apache.org/blog/2024/08/20/python-datafusion-40.0.0/) and [pull request](https://github.com/apache/datafusion-python/pull/750) for more details.) 
+ +Mostly, the `python` code is limited to pure wrappers with type hints and good docstrings, but there are a few reasons for when the code does more: + +1. Trivial aliases like [`array_append`][datafusion.functions.array_append] and [`list_append`][datafusion.functions.list_append]. +2. Simple type conversion, like from a `path` to a `string` of the path or from `number` to `lit(number)`. +3. The additional code makes an API **much** more pythonic, like we do for [`named_struct`][datafusion.functions.named_struct] (see [source code](https://github.com/apache/datafusion-python/blob/a0913c728f5f323c1eb4913e614c9d996083e274/python/datafusion/functions.py#L1040-L1046)). + +## Update Dependencies + +To change test dependencies, change the `pyproject.toml` and run the command below. + +To update dependencies, run + +```shell +uv sync --dev --no-install-package datafusion +``` + +## Improving Build Speed + +The [pyo3](https://github.com/PyO3/pyo3) dependency of this project contains a `build.rs` file which +can cause it to rebuild frequently. You can prevent this from happening by defining a `PYO3_CONFIG_FILE` +environment variable that points to a file with your build configuration. Whenever your build configuration +changes, such as during some major version updates, you will need to regenerate this file. This variable +should point to a fully resolved path on your build machine. + +To generate this file, use the following command: + +```shell +PYO3_PRINT_CONFIG=1 cargo build +``` + +This will generate some output that looks like the following. You will want to copy these contents into +a file. If you place this file in your project directory with filename `.pyo3_build_config` it will +be ignored by `git`.
+ +``` +implementation=CPython +version=3.9 +shared=true +abi3=true +lib_name=python3.12 +lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib +executable=/Users/myusername/src/datafusion-python/.venv/bin/python +pointer_width=64 +build_flags= +suppress_build_script_link_lines=false +``` + +Add the environment variable to your system. + +```shell +export PYO3_CONFIG_FILE="/Users/myusername/src/datafusion-python/.pyo3_build_config" +``` + +If you are on a Mac and you use VS Code for your IDE, you will want to add these variables +to your settings. You can find the appropriate rust flags by looking in the +`.cargo/config.toml` file. + +``` +"rust-analyzer.cargo.extraEnv": { + "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", + "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config" +}, +"rust-analyzer.runnables.extraEnv": { + "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", + "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config" +} +``` diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst deleted file mode 100644 index 33c2b274c..000000000 --- a/docs/source/contributor-guide/introduction.rst +++ /dev/null @@ -1,154 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -..
KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Introduction -============ -We welcome and encourage contributions of all kinds, such as: - -1. Tickets with issue reports of feature requests -2. Documentation improvements -3. Code, both PR and (especially) PR Review. - -In addition to submitting new PRs, we have a healthy tradition of community members reviewing each other’s PRs. -Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases. - -Before opening a pull request that touches PyO3 bindings, please review the -:ref:`PyO3 class mutability guidelines ` so you can flag missing -``#[pyclass(frozen)]`` annotations during development and review. - -How to develop --------------- - -This assumes that you have rust and cargo installed. We use the workflow recommended by -`pyo3 `_ and `maturin `_. We recommend using -`uv `_ for python package management. - -By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually. This means -that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion` -and for `uv run` commands the additional parameter `--no-project` - -Bootstrap: - -.. code-block:: shell - - # fetch this repo - git clone git@github.com:apache/datafusion-python.git - # create the virtual environment - uv sync --dev --no-install-package datafusion - # activate the environment - source .venv/bin/activate - -The tests rely on test data in git submodules. - -.. code-block:: shell - - git submodule init - git submodule update - - -Whenever rust code changes (your changes or via `git pull`): - -.. 
code-block:: shell - - # make sure you activate the venv using "source .venv/bin/activate" first - maturin develop -uv - python -m pytest - -Running & Installing pre-commit hooks -------------------------------------- - -arrow-datafusion-python takes advantage of `pre-commit `_ to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise. - -Our pre-commit hooks can be installed by running :code:`pre-commit install`, which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing. - -The pre-commit hooks can also be run adhoc without installing them by simply running :code:`pre-commit run --all-files` - -Guidelines for Separating Python and Rust Code ----------------------------------------------- - -Version 40 of ``datafusion-python`` introduced ``python`` wrappers around the ``pyo3`` generated code to vastly improve the user experience. (See the `blog post `_ and `pull request `_ for more details.) - -Mostly, the ``python`` code is limited to pure wrappers with type hints and good docstrings, but there are a few reasons for when the code does more: - -1. Trivial aliases like :py:func:`~datafusion.functions.array_append` and :py:func:`~datafusion.functions.list_append`. -2. Simple type conversion, like from a ``path`` to a ``string`` of the path or from ``number`` to ``lit(number)``. -3. The additional code makes an API **much** more pythonic, like we do for :py:func:`~datafusion.functions.named_struct` (see `source code `_). - - -Update Dependencies -------------------- - -To change test dependencies, change the ``pyproject.toml`` and run - -To update dependencies, run - -.. 
code-block:: shell - - uv sync --dev --no-install-package datafusion - -Improving Build Speed ---------------------- - -The `pyo3 `_ dependency of this project contains a ``build.rs`` file which -can cause it to rebuild frequently. You can prevent this from happening by defining a ``PYO3_CONFIG_FILE`` -environment variable that points to a file with your build configuration. Whenever your build configuration -changes, such as during some major version updates, you will need to regenerate this file. This variable -should point to a fully resolved path on your build machine. - -To generate this file, use the following command: - -.. code-block:: shell - - PYO3_PRINT_CONFIG=1 cargo build - -This will generate some output that looks like the following. You will want to copy these contents intro -a file. If you place this file in your project directory with filename ``.pyo3_build_config`` it will -be ignored by ``git``. - -.. code-block:: - - implementation=CPython - version=3.9 - shared=true - abi3=true - lib_name=python3.12 - lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib - executable=/Users/myusername/src/datafusion-python/.venv/bin/python - pointer_width=64 - build_flags= - suppress_build_script_link_lines=false - -Add the environment variable to your system. - -.. code-block:: shell - - export PYO3_CONFIG_FILE="/Users//myusername/src/datafusion-python/.pyo3_build_config" - -If you are on a Mac and you use VS Code for your IDE, you will want to add these variables -to your settings. You can find the appropriate rust flags by looking in the -``.cargo/config.toml`` file. - -.. 
code-block:: - - "rust-analyzer.cargo.extraEnv": { - "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", - "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config" - }, - "rust-analyzer.runnables.extraEnv": { - "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup", - "PYO3_CONFIG_FILE": "/Users/myusername/src/personal/datafusion-python/.pyo3_build_config" - } diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 000000000..de08dcdfa --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,48 @@ + + +# DataFusion in Python + +This is a Python library that binds to [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/datafusion). + +Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. + +It also allows you to use UDFs and UDAFs for complex operations. + +The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. + +Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org), which makes strong assumptions about thread safety and lack of memory leaks. + +Technically, zero-copy is achieved via the [c data interface](https://arrow.apache.org/docs/format/CDataInterface.html). 
+ +## Install + +```shell +pip install datafusion +``` + +## Example + +```python exec="1" source="material-block" result="text" session="index" +ctx = SessionContext() + +df = ctx.read_csv("pokemon.csv") + +df.show() +``` diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 6b72537da..000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -==================== -DataFusion in Python -==================== - -This is a Python library that binds to `Apache Arrow `_ in-memory query engine `DataFusion `_. - -Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python. - -It also allows you to use UDFs and UDAFs for complex operations. - -The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations. 
- -Its query engine, DataFusion, is written in `Rust `_, which makes strong assumptions about thread safety and lack of memory leaks. - -Technically, zero-copy is achieved via the `c data interface `_. - -Install -------- - -.. code-block:: shell - - pip install datafusion - -Example -------- - -.. ipython:: python - - from datafusion import SessionContext - - ctx = SessionContext() - - df = ctx.read_csv("pokemon.csv") - - df.show() - - -.. toctree:: - :hidden: - :maxdepth: 1 - - user-guide/index - contributor-guide/index - API Reference - links diff --git a/docs/source/links.md b/docs/source/links.md new file mode 100644 index 000000000..c7dc360c1 --- /dev/null +++ b/docs/source/links.md @@ -0,0 +1,27 @@ + + +# Links + +External resources for the DataFusion in Python project. + +- [GitHub and Issue Tracker](https://github.com/apache/datafusion-python) +- [Rust API Docs](https://docs.rs/datafusion/latest/datafusion/) +- [Code of Conduct](https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md) +- [Examples](https://github.com/apache/datafusion-python/tree/main/examples) diff --git a/docs/source/links.rst b/docs/source/links.rst deleted file mode 100644 index 10473f31b..000000000 --- a/docs/source/links.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. 
specific language governing permissions and limitations -.. under the License. - -===== -Links -===== - -External resources for the DataFusion in Python project. - -.. toctree:: - :maxdepth: 1 - - GitHub and Issue Tracker - Rust API Docs - Code of Conduct - Examples diff --git a/docs/source/reference/datafusion/catalog.md b/docs/source/reference/datafusion/catalog.md new file mode 100644 index 000000000..a533f6cf2 --- /dev/null +++ b/docs/source/reference/datafusion/catalog.md @@ -0,0 +1,25 @@ +# Catalog + +::: datafusion.catalog.Catalog + +::: datafusion.catalog.CatalogProvider + +::: datafusion.catalog.CatalogProviderExportable + +::: datafusion.catalog.CatalogList + +::: datafusion.catalog.CatalogProviderList + +::: datafusion.catalog.CatalogProviderListExportable + +::: datafusion.catalog.Schema + +::: datafusion.catalog.SchemaProvider + +::: datafusion.catalog.SchemaProviderExportable + +::: datafusion.catalog.Table + +::: datafusion.catalog.TableProviderFactory + +::: datafusion.catalog.TableProviderFactoryExportable diff --git a/docs/source/reference/datafusion/common.md b/docs/source/reference/datafusion/common.md new file mode 100644 index 000000000..8b69e8e01 --- /dev/null +++ b/docs/source/reference/datafusion/common.md @@ -0,0 +1,3 @@ +# Common + +::: datafusion.common.DFSchema diff --git a/docs/source/reference/datafusion/context.md b/docs/source/reference/datafusion/context.md new file mode 100644 index 000000000..dbbe27bfc --- /dev/null +++ b/docs/source/reference/datafusion/context.md @@ -0,0 +1,21 @@ +# SessionContext + +::: datafusion.context.SessionContext + options: + filters: + - "!^_" + - "!^tables$" + +::: datafusion.context.SessionConfig + +::: datafusion.context.SQLOptions + +::: datafusion.context.RuntimeEnvBuilder + +::: datafusion.context.ArrowStreamExportable + +::: datafusion.context.ArrowArrayExportable + +::: datafusion.context.TableProviderExportable + +::: datafusion.context.PhysicalOptimizerRuleExportable diff --git 
a/docs/source/reference/datafusion/dataframe.md b/docs/source/reference/datafusion/dataframe.md new file mode 100644 index 000000000..e2db4f2c0 --- /dev/null +++ b/docs/source/reference/datafusion/dataframe.md @@ -0,0 +1,27 @@ +# DataFrame + +::: datafusion.dataframe.DataFrame + options: + filters: + - "!^_" + - "!^unnest_column$" + +::: datafusion.dataframe.DataFrameWriteOptions + +::: datafusion.dataframe.ParquetWriterOptions + +::: datafusion.dataframe.ParquetColumnOptions + +::: datafusion.dataframe.InsertOp + +::: datafusion.dataframe.ExplainFormat + +::: datafusion.dataframe.Compression + +## DataFrame Formatter + +See [DataFrame Formatter](dataframe_formatter.md) for the full formatter API +([`configure_formatter`][datafusion.dataframe_formatter.configure_formatter], +[`DataFrameHtmlFormatter`][datafusion.dataframe_formatter.DataFrameHtmlFormatter], +[`CellFormatter`][datafusion.dataframe_formatter.CellFormatter], +[`StyleProvider`][datafusion.dataframe_formatter.StyleProvider], etc.). diff --git a/docs/source/reference/datafusion/dataframe_formatter.md b/docs/source/reference/datafusion/dataframe_formatter.md new file mode 100644 index 000000000..d9530586f --- /dev/null +++ b/docs/source/reference/datafusion/dataframe_formatter.md @@ -0,0 +1,23 @@ +# DataFrame Formatter + +The `datafusion.dataframe_formatter` module controls how DataFrames render +in notebooks and HTML contexts. See the user-guide +[Rendering](../../user-guide/dataframe/rendering.md) page for worked examples. 
+ +::: datafusion.dataframe_formatter.configure_formatter + +::: datafusion.dataframe_formatter.get_formatter + +::: datafusion.dataframe_formatter.set_formatter + +::: datafusion.dataframe_formatter.reset_formatter + +::: datafusion.dataframe_formatter.DataFrameHtmlFormatter + +::: datafusion.dataframe_formatter.CellFormatter + +::: datafusion.dataframe_formatter.StyleProvider + +::: datafusion.dataframe_formatter.DefaultStyleProvider + +::: datafusion.dataframe_formatter.FormatterManager diff --git a/docs/source/reference/datafusion/expr.md b/docs/source/reference/datafusion/expr.md new file mode 100644 index 000000000..d855cd939 --- /dev/null +++ b/docs/source/reference/datafusion/expr.md @@ -0,0 +1,9 @@ +# Expr + +::: datafusion.expr.Expr + +::: datafusion.expr.WindowFrame + +::: datafusion.expr.CaseBuilder + +::: datafusion.expr.GroupingSet diff --git a/docs/source/reference/datafusion/functions.md b/docs/source/reference/datafusion/functions.md new file mode 100644 index 000000000..17c3f0e1a --- /dev/null +++ b/docs/source/reference/datafusion/functions.md @@ -0,0 +1,10 @@ +# Functions + +The `datafusion.functions` module provides 290+ scalar, aggregate, and window +functions. 
Import as: + +```python +from datafusion import functions as F +``` + +::: datafusion.functions diff --git a/docs/source/reference/datafusion/index.md b/docs/source/reference/datafusion/index.md new file mode 100644 index 000000000..eeaa0663c --- /dev/null +++ b/docs/source/reference/datafusion/index.md @@ -0,0 +1,71 @@ + + +# datafusion + +::: datafusion + options: + members: false + +## Submodules + +| Module | Description | +| --- | --- | +| [`catalog`](catalog.md) | Catalog, schema, and table providers | +| [`common`](common.md) | Common types shared across the API | +| [`context`](context.md) | `SessionContext`, session config, and runtime | +| [`dataframe`](dataframe.md) | `DataFrame` query builder and write options | +| [`dataframe_formatter`](dataframe_formatter.md) | HTML/text rendering for DataFrames | +| [`expr`](expr.md) | Expression tree (`Expr`, window frames, grouping sets) | +| [`functions`](functions.md) | 290+ built-in scalar, aggregate, and window functions | +| [`input`](input.md) | Input source plugins | +| [`io`](io.md) | `read_csv`, `read_parquet`, `read_json`, `read_avro` | +| [`ipc`](ipc.md) | Arrow IPC serialization for DataFrames and expressions | +| [`object_store`](object_store.md) | Object store backends (S3, GCS, Azure, local) | +| [`options`](options.md) | Read-option configuration types | +| [`plan`](plan.md) | Logical and physical plan introspection | +| [`record_batch`](record_batch.md) | `RecordBatch` and `RecordBatchStream` | +| [`substrait`](substrait.md) | Substrait plan serialization | +| [`unparser`](unparser.md) | Convert logical plans back to SQL | +| [`user_defined`](user_defined.md) | User-defined scalar, aggregate, window, and table functions | + +## Top-level names + +These names live on the `datafusion` package itself and are imported as +`from datafusion import <name>`.
+ +### Column builders + +::: datafusion.col.col + +::: datafusion.col.column + +### Literal builders + +::: datafusion.lit + +::: datafusion.literal + +::: datafusion.string_literal + +::: datafusion.str_lit + +::: datafusion.literal_with_metadata + +::: datafusion.lit_with_metadata diff --git a/docs/source/reference/datafusion/input.md b/docs/source/reference/datafusion/input.md new file mode 100644 index 000000000..88f2528f2 --- /dev/null +++ b/docs/source/reference/datafusion/input.md @@ -0,0 +1,28 @@ + + +# input + +::: datafusion.input + options: + members: false + +::: datafusion.input.base + +::: datafusion.input.location diff --git a/docs/source/reference/datafusion/io.md b/docs/source/reference/datafusion/io.md new file mode 100644 index 000000000..01154fc75 --- /dev/null +++ b/docs/source/reference/datafusion/io.md @@ -0,0 +1,11 @@ +# I/O + +Top-level reader functions for loading data into a DataFrame. + +::: datafusion.io.read_csv + +::: datafusion.io.read_parquet + +::: datafusion.io.read_json + +::: datafusion.io.read_avro diff --git a/docs/source/reference/datafusion/ipc.md b/docs/source/reference/datafusion/ipc.md new file mode 100644 index 000000000..e8e1297bf --- /dev/null +++ b/docs/source/reference/datafusion/ipc.md @@ -0,0 +1,3 @@ +# IPC + +::: datafusion.ipc diff --git a/docs/source/reference/datafusion/object_store.md b/docs/source/reference/datafusion/object_store.md new file mode 100644 index 000000000..7012c1482 --- /dev/null +++ b/docs/source/reference/datafusion/object_store.md @@ -0,0 +1,15 @@ +# Object Store + +::: datafusion.object_store + options: + members: false + +::: datafusion.object_store.AmazonS3 + +::: datafusion.object_store.GoogleCloud + +::: datafusion.object_store.Http + +::: datafusion.object_store.LocalFileSystem + +::: datafusion.object_store.MicrosoftAzure diff --git a/docs/source/reference/datafusion/options.md b/docs/source/reference/datafusion/options.md new file mode 100644 index 000000000..6bdd966b4 --- /dev/null 
+++ b/docs/source/reference/datafusion/options.md @@ -0,0 +1,3 @@ +# Options + +::: datafusion.options.CsvReadOptions diff --git a/docs/source/reference/datafusion/plan.md b/docs/source/reference/datafusion/plan.md new file mode 100644 index 000000000..3e2d48c45 --- /dev/null +++ b/docs/source/reference/datafusion/plan.md @@ -0,0 +1,9 @@ +# Plan + +::: datafusion.plan.LogicalPlan + +::: datafusion.plan.ExecutionPlan + +::: datafusion.plan.Metric + +::: datafusion.plan.MetricsSet diff --git a/docs/source/reference/datafusion/record_batch.md b/docs/source/reference/datafusion/record_batch.md new file mode 100644 index 000000000..c02b786ef --- /dev/null +++ b/docs/source/reference/datafusion/record_batch.md @@ -0,0 +1,10 @@ +# RecordBatch + +::: datafusion.record_batch.RecordBatch + +::: datafusion.record_batch.RecordBatchStream + options: + filters: + - "!^_" + - "^__next__$" + - "^__anext__$" diff --git a/docs/source/reference/datafusion/substrait.md b/docs/source/reference/datafusion/substrait.md new file mode 100644 index 000000000..f615bb463 --- /dev/null +++ b/docs/source/reference/datafusion/substrait.md @@ -0,0 +1,10 @@ +# Substrait + +::: datafusion.substrait + options: + filters: + - "!^_" + - "!^serde$" + - "!^plan$" + - "!^producer$" + - "!^consumer$" diff --git a/docs/source/reference/datafusion/unparser.md b/docs/source/reference/datafusion/unparser.md new file mode 100644 index 000000000..c26b0d403 --- /dev/null +++ b/docs/source/reference/datafusion/unparser.md @@ -0,0 +1,3 @@ +# Unparser + +::: datafusion.unparser diff --git a/docs/source/reference/datafusion/user_defined.md b/docs/source/reference/datafusion/user_defined.md new file mode 100644 index 000000000..99e8889ad --- /dev/null +++ b/docs/source/reference/datafusion/user_defined.md @@ -0,0 +1,33 @@ +# User-Defined Functions + +::: datafusion.user_defined.Volatility + +::: datafusion.user_defined.ScalarUDF + +::: datafusion.user_defined.AggregateUDF + +::: datafusion.user_defined.WindowUDF + 
+::: datafusion.user_defined.TableFunction + +::: datafusion.user_defined.Accumulator + +::: datafusion.user_defined.WindowEvaluator + +::: datafusion.user_defined.udf + +::: datafusion.user_defined.udaf + +::: datafusion.user_defined.udwf + +::: datafusion.user_defined.udtf + +::: datafusion.user_defined.ScalarUDFExportable + +::: datafusion.user_defined.AggregateUDFExportable + +::: datafusion.user_defined.WindowUDFExportable + +::: datafusion.user_defined.LogicalExtensionCodecExportable + +::: datafusion.user_defined.PhysicalExtensionCodecExportable diff --git a/docs/source/reference/index.md b/docs/source/reference/index.md new file mode 100644 index 000000000..97e5830a6 --- /dev/null +++ b/docs/source/reference/index.md @@ -0,0 +1,24 @@ + + +# API Reference + +The Python API of DataFusion is exported from the [`datafusion`](datafusion/index.md) +package. See the package landing page for an overview and a list of submodules, +or jump directly to any module from the navigation sidebar. diff --git a/docs/source/user-guide/ai-coding-assistants.md b/docs/source/user-guide/ai-coding-assistants.md new file mode 100644 index 000000000..2057ec410 --- /dev/null +++ b/docs/source/user-guide/ai-coding-assistants.md @@ -0,0 +1,81 @@ + + +# Using AI Coding Assistants + +If you write DataFusion Python code with an AI coding assistant, this +project ships machine-readable guidance so the assistant produces +idiomatic code rather than guessing from its training data. + +## What is published + +- [SKILL.md](https://github.com/apache/datafusion-python/blob/main/skills/datafusion_python/SKILL.md) — + a dense, skill-oriented reference covering imports, data loading, + DataFrame operations, expression building, SQL-to-DataFrame mappings, + idiomatic patterns, and common pitfalls. Follows the + [Agent Skills](https://agentskills.io) open standard. 
+- [llms.txt](https://datafusion.apache.org/python/llms.txt) — an entry point for LLM-based tools following the + [llmstxt.org](https://llmstxt.org) convention. Categorized links to the + skill, user guide, API reference, and examples. + +Both files live at stable URLs so an agent can discover them without a +checkout of the repo. + +## Installing the skill + +**Preferred:** run + +```shell +npx skills add apache/datafusion-python +``` + +This installs the skill in any supported agent on your machine (Claude +Code, Cursor, Windsurf, Cline, Codex, Copilot, Gemini CLI, and others). +The command writes the pointer into the agent's configuration so that any +project you open that uses DataFusion Python picks up the skill +automatically. + +**Manual:** if you are not using the `skills` registry, paste this +single line into your project's `AGENTS.md` or `CLAUDE.md`: + +``` +For DataFusion Python code, see https://github.com/apache/datafusion-python/blob/main/skills/datafusion_python/SKILL.md +``` + +Most assistants resolve that pointer the first time they see a +DataFusion-related prompt in the project. + +## What the skill covers + +Writing DataFusion Python code has a handful of conventions that are easy +for a model to miss — bitwise `&` / `|` / `~` instead of Python +`and` / `or` / `not`, the lazy-DataFrame immutability model, how +window functions replace SQL correlated subqueries, the `case` / +`when` builder syntax, and the `in_list` / `array_position` options +for membership tests. The skill enumerates each of these with short, +copyable examples. + +It is *not* a replacement for this user guide. Think of it as a distilled +reference the assistant keeps open while it writes code for you. + +## If you are an agent author + +The skill file and `llms.txt` are the two supported integration +points. Both are versioned along with the release and follow open +standards — no project-specific handshake is required. 
diff --git a/docs/source/user-guide/ai-coding-assistants.rst b/docs/source/user-guide/ai-coding-assistants.rst deleted file mode 100644 index fb7998c6d..000000000 --- a/docs/source/user-guide/ai-coding-assistants.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Using AI Coding Assistants -========================== - -If you write DataFusion Python code with an AI coding assistant, this -project ships machine-readable guidance so the assistant produces -idiomatic code rather than guessing from its training data. - -What is published ------------------ - -- `SKILL.md `_ — - a dense, skill-oriented reference covering imports, data loading, - DataFrame operations, expression building, SQL-to-DataFrame mappings, - idiomatic patterns, and common pitfalls. Follows the - `Agent Skills `_ open standard. -- `llms.txt `_ — an entry point for LLM-based tools following the - `llmstxt.org `_ convention. Categorized links to the - skill, user guide, API reference, and examples. - -Both files live at stable URLs so an agent can discover them without a -checkout of the repo. - -Installing the skill --------------------- - -**Preferred:** run - -.. 
code-block:: shell - - npx skills add apache/datafusion-python - -This installs the skill in any supported agent on your machine (Claude -Code, Cursor, Windsurf, Cline, Codex, Copilot, Gemini CLI, and others). -The command writes the pointer into the agent's configuration so that any -project you open that uses DataFusion Python picks up the skill -automatically. - -**Manual:** if you are not using the ``skills`` registry, paste this -single line into your project's ``AGENTS.md`` or ``CLAUDE.md``:: - - For DataFusion Python code, see https://github.com/apache/datafusion-python/blob/main/skills/datafusion_python/SKILL.md - -Most assistants resolve that pointer the first time they see a -DataFusion-related prompt in the project. - -What the skill covers ---------------------- - -Writing DataFusion Python code has a handful of conventions that are easy -for a model to miss — bitwise ``&`` / ``|`` / ``~`` instead of Python -``and`` / ``or`` / ``not``, the lazy-DataFrame immutability model, how -window functions replace SQL correlated subqueries, the ``case`` / -``when`` builder syntax, and the ``in_list`` / ``array_position`` options -for membership tests. The skill enumerates each of these with short, -copyable examples. - -It is *not* a replacement for this user guide. Think of it as a distilled -reference the assistant keeps open while it writes code for you. - -If you are an agent author --------------------------- - -The skill file and ``llms.txt`` are the two supported integration -points. Both are versioned along with the release and follow open -standards — no project-specific handshake is required. diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst deleted file mode 100644 index 7c6820461..000000000 --- a/docs/source/user-guide/basics.rst +++ /dev/null @@ -1,98 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. 
distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _user_guide_concepts: - -Concepts -======== - -In this section, we will cover a basic example to introduce a few key concepts. We will use the -2021 Yellow Taxi Trip Records (`download `_), -from the `TLC Trip Record Data `_. - -.. ipython:: python - - from datafusion import SessionContext, col, lit, functions as f - - ctx = SessionContext() - - df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") - - df = df.select( - "trip_distance", - col("total_amount").alias("total"), - (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"), - ) - - df.show() - -Session Context ---------------- - -The first statement group creates a :py:class:`~datafusion.context.SessionContext`. - -.. code-block:: python - - # create a context - ctx = datafusion.SessionContext() - -A Session Context is the main interface for executing queries with DataFusion. It maintains the state -of the connection between a user and an instance of the DataFusion engine. Additionally it provides -the following functionality: - -- Create a DataFrame from a data source. -- Register a data source as a table that can be referenced from a SQL query. -- Execute a SQL query - -DataFrame ---------- - -The second statement group creates a :code:`DataFrame`, - -.. 
code-block:: python - - # Create a DataFrame from a file - df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") - -A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame `_. -DataFrames are typically created by calling a method on :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by -calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, -and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. - -For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe/index`. - -Expressions ------------ - -The third statement uses :code:`Expressions` to build up a query definition. You can find -explanations for what the functions below do in the user documentation for -:py:func:`~datafusion.col`, :py:func:`~datafusion.lit`, :py:func:`~datafusion.functions.round`, -and :py:func:`~datafusion.expr.Expr.alias`. - -.. code-block:: python - - df = df.select( - "trip_distance", - col("total_amount").alias("total"), - (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"), - ) - -Finally the :py:func:`~datafusion.dataframe.DataFrame.show` method converts the logical plan -represented by the DataFrame into a physical plan and execute it, collecting all results and -displaying them to the user. It is important to note that DataFusion performs lazy evaluation -of the DataFrame. Until you call a method such as :py:func:`~datafusion.dataframe.DataFrame.show` -or :py:func:`~datafusion.dataframe.DataFrame.collect`, DataFusion will not perform the query. 
diff --git a/docs/source/user-guide/common-operations/aggregations.md b/docs/source/user-guide/common-operations/aggregations.md new file mode 100644 index 000000000..1d4bb3dee --- /dev/null +++ b/docs/source/user-guide/common-operations/aggregations.md @@ -0,0 +1,472 @@ + + + +# Aggregation + +An aggregate or aggregation is a function where the values of multiple rows are processed together +to form a single summary value. To perform an aggregation, DataFusion provides the +[`aggregate`][datafusion.dataframe.DataFrame.aggregate] method. + +```python exec="1" source="material-block" result="text" session="aggregations" +ctx = SessionContext() +df = ctx.read_csv("pokemon.csv") + +col_type_1 = col('"Type 1"') +col_type_2 = col('"Type 2"') +col_speed = col('"Speed"') +col_attack = col('"Attack"') + +df.aggregate( + [col_type_1], + [ + f.approx_distinct(col_speed).alias("Count"), + f.approx_median(col_speed).alias("Median Speed"), + f.approx_percentile_cont(col_speed, 0.9).alias("90% Speed"), + ], +).show() +``` + + +When `group_by` is `None` or an empty list, the aggregation is done over the whole +[`DataFrame`][datafusion.dataframe.DataFrame]. For grouping, the `group_by` list must contain at least one column. + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [col_type_1], + [ + f.max(col_speed).alias("Max Speed"), + f.avg(col_speed).alias("Avg Speed"), + f.min(col_speed).alias("Min Speed"), + ], +).show() +``` + + +More than one column can be used for grouping: + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [col_type_1, col_type_2], + [ + f.max(col_speed).alias("Max Speed"), + f.avg(col_speed).alias("Avg Speed"), + f.min(col_speed).alias("Min Speed"), + ], +).show() +``` + + +## Setting Parameters + +Each of the built-in aggregate functions provides arguments for the parameters that affect their +operation.
These can also be overridden using the builder approach to setting any of the following +parameters. When you use the builder, you must call `build()` to finish. For example, these two +expressions are equivalent. + +```python exec="1" source="material-block" session="aggregations" +first_1 = f.first_value(col("a"), order_by=[col("a")]) +first_2 = f.first_value(col("a")).order_by(col("a")).build() +``` + + +### Ordering + +You can control the order in which rows are processed by aggregate functions by providing +a list of sort expressions for the `order_by` parameter. In the following example, we +sort the Pokemon by their attack in increasing order and take the first value, which gives us the +Pokemon with the smallest attack value in each `Type 1`. + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [col('"Type 1"')], + [ + f.first_value( + col('"Name"'), order_by=[col('"Attack"').sort(ascending=True)] + ).alias("Smallest Attack") + ], +).show() +``` + + +### Distinct + +When you set the parameter `distinct` to `True`, then unique values will only be evaluated one +time each. Suppose we want to create an array of all of the `Type 2` for each `Type 1` of our +Pokemon set. Since there will be many entries of `Type 2`, we only want each distinct value. + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")] +).show() +``` + + +In the output of the above we can see that there are some `Type 1` for which the `Type 2` entry +is `null`. In reality, we probably want to filter those out. We can do this in two ways. First, +we can filter DataFrame rows that have no `Type 2`. If we do this, we might have some `Type 1` +entries entirely removed. Second, we can use the `filter` argument described below. 
+ +```python exec="1" source="material-block" result="text" session="aggregations" +df.filter(col_type_2.is_not_null()).aggregate( + [col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")] +) + +df.aggregate( + [col_type_1], + [ + f.array_agg(col_type_2, distinct=True, filter=col_type_2.is_not_null()).alias( + "Type 2 List" + ) + ], +).show() +``` + + +Which approach you take should depend on your use case. + +### Null Treatment + +This option allows you to either respect or ignore null values. + +One common usage for handling nulls is the case where you want to find the first value within a +partition. By setting the null treatment to ignore nulls, we can find the first non-null value +in our partition. + +```python exec="1" source="material-block" result="text" session="aggregations" +from datafusion.common import NullTreatment + +df.aggregate( + [col_type_1], + [ + f.first_value( + col_type_2, + order_by=[col_attack], + null_treatment=NullTreatment.RESPECT_NULLS, + ).alias("Lowest Attack Type 2") + ], +) + +df.aggregate( + [col_type_1], + [ + f.first_value( + col_type_2, order_by=[col_attack], null_treatment=NullTreatment.IGNORE_NULLS + ).alias("Lowest Attack Type 2") + ], +).show() +``` + + +### Filter + +The `filter` option restricts which rows are included in the aggregate function. The example +above shows how it can be used to filter only the rows evaluated by the +aggregate function, without removing rows from the entire DataFrame. + +Filter takes a single expression. + +Suppose we want to find the average speed of only those Pokemon that have low Attack values. 
+ +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [col_type_1], + [ + f.avg(col_speed).alias("Avg Speed All"), + f.avg(col_speed, filter=col_attack < lit(50)).alias("Avg Speed Low Attack"), + ], +).show() +``` + + +### Comparing subsets within a group + +Sometimes you need to compare the full membership of a group against a +subset that meets some condition — for example, "which groups have at least +one failure, but not every member failed?". The `filter` argument on an +aggregate restricts the rows that contribute to *that* aggregate without +dropping the group, so a single pass can produce both the full set and the +filtered subset side by side. Pairing +[`array_agg`][datafusion.functions.array_agg] with `distinct=True` and +`filter=` is a compact way to express this: collect the distinct values +of the group, collect the distinct values that satisfy the condition, then +compare the two arrays. + +Suppose each row records a line item with the supplier that fulfilled it and +a flag for whether that supplier failed to meet the commit date. 
We want to identify +*partially failed* orders — orders where at least one supplier failed but +not every supplier failed: + +```python exec="1" source="material-block" result="text" session="aggregations" +orders_df = ctx.from_pydict( + { + "order_id": [1, 1, 1, 2, 2, 3, 4, 4], + "supplier_id": [100, 101, 102, 200, 201, 300, 400, 401], + "failed": [False, True, False, False, False, True, True, True], + }, +) + +grouped = orders_df.aggregate( + [col("order_id")], + [ + f.array_agg(col("supplier_id"), distinct=True).alias("all_suppliers"), + f.array_agg( + col("supplier_id"), + filter=col("failed"), + distinct=True, + ).alias("failed_suppliers"), + ], +) + +print(grouped.filter( + (f.array_length(col("failed_suppliers")) > lit(0)) + & (f.array_length(col("failed_suppliers")) < f.array_length(col("all_suppliers"))) +).select(col("order_id"), col("failed_suppliers"))) +``` + + +Order 1 is partial (one of three suppliers failed). Order 2 is excluded +because no supplier failed, order 3 because its only supplier failed, and +order 4 because both of its suppliers failed. + +## Grouping Sets + +The default style of aggregation produces one row per group. Sometimes you want a single query to +produce rows at multiple levels of detail — for example, totals per type *and* an overall grand +total, or subtotals for every combination of two columns plus the individual column totals. Writing +separate queries and concatenating them is tedious and runs the data multiple times. Grouping sets +solve this by letting you specify several grouping levels in one pass. 
+ +DataFusion supports three grouping set styles through the +[`GroupingSet`][datafusion.expr.GroupingSet] class: + +- [`rollup`][datafusion.expr.GroupingSet.rollup] — hierarchical subtotals, like a drill-down report +- [`cube`][datafusion.expr.GroupingSet.cube] — every possible subtotal combination, like a pivot table +- [`grouping_sets`][datafusion.expr.GroupingSet.grouping_sets] — explicitly list exactly which grouping levels you want + +Because result rows come from different grouping levels, a column that is *not* part of a +particular level will be `null` in that row. Use [`grouping`][datafusion.functions.grouping] to +distinguish a real `null` in the data from one that means "this column was aggregated across." +It returns `0` when the column is a grouping key for that row, and `1` when it is not. + +### Rollup + +[`rollup`][datafusion.expr.GroupingSet.rollup] creates a hierarchy. `rollup(a, b)` produces +grouping sets `(a, b)`, `(a)`, and `()` — like nested subtotals in a report. This is useful +when your columns have a natural hierarchy, such as region → city or type → subtype. + +Suppose we want to summarize Pokemon stats by `Type 1` with subtotals and a grand total. With +the default aggregation style we would need two separate queries. With `rollup` we get it all at +once: + +```python exec="1" source="material-block" result="text" session="aggregations" +from datafusion.expr import GroupingSet + +df.aggregate( + [GroupingSet.rollup(col_type_1)], + [ + f.count(col_speed).alias("Count"), + f.avg(col_speed).alias("Avg Speed"), + f.max(col_speed).alias("Max Speed"), + ], +).sort(col_type_1.sort(ascending=True, nulls_first=True)).show() +``` + + +The first row — where `Type 1` is `null` — is the grand total across all types. But how do you +tell a grand-total `null` apart from a Pokemon that genuinely has no type? 
The +[`grouping`][datafusion.functions.grouping] function returns `0` when the column is a grouping key +for that row and `1` when it is aggregated across. + +Use `.alias()` to give the column a readable name: + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [GroupingSet.rollup(col_type_1)], + [ + f.count(col_speed).alias("Count"), + f.avg(col_speed).alias("Avg Speed"), + f.grouping(col_type_1).alias("Is Total"), + ], +).sort(col_type_1.sort(ascending=True, nulls_first=True)).show() +``` + + +With two columns the hierarchy becomes more apparent. `rollup(Type 1, Type 2)` produces: + +- one row per `(Type 1, Type 2)` pair — the most detailed level +- one row per `Type 1` — subtotals +- one grand total row + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [GroupingSet.rollup(col_type_1, col_type_2)], + [f.count(col_speed).alias("Count"), f.avg(col_speed).alias("Avg Speed")], +).sort( + col_type_1.sort(ascending=True, nulls_first=True), + col_type_2.sort(ascending=True, nulls_first=True), +).show() +``` + + +### Cube + +[`cube`][datafusion.expr.GroupingSet.cube] produces every possible subset. `cube(a, b)` +produces grouping sets `(a, b)`, `(a)`, `(b)`, and `()` — one more than `rollup` because +it also includes `(b)` alone. This is useful when neither column is "above" the other in a +hierarchy and you want all cross-tabulations. 
+ +For our Pokemon data, `cube(Type 1, Type 2)` gives us stats broken down by the type pair, +by `Type 1` alone, by `Type 2` alone, and a grand total — all in one query: + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [GroupingSet.cube(col_type_1, col_type_2)], + [f.count(col_speed).alias("Count"), f.avg(col_speed).alias("Avg Speed")], +).sort( + col_type_1.sort(ascending=True, nulls_first=True), + col_type_2.sort(ascending=True, nulls_first=True), +).show() +``` + + +Compared to the `rollup` example above, notice the extra rows where `Type 1` is `null` but +`Type 2` has a value — those are the per-`Type 2` subtotals that `rollup` does not include. + +### Explicit Grouping Sets + +[`grouping_sets`][datafusion.expr.GroupingSet.grouping_sets] lets you list exactly which grouping levels +you need when `rollup` or `cube` would produce too many or too few. Each argument is a list of +columns forming one grouping set. + +For example, if we want only the per-`Type 1` totals and per-`Type 2` totals — but *not* the +full `(Type 1, Type 2)` detail rows or the grand total — we can ask for exactly that: + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [GroupingSet.grouping_sets([col_type_1], [col_type_2])], + [f.count(col_speed).alias("Count"), f.avg(col_speed).alias("Avg Speed")], +).sort( + col_type_1.sort(ascending=True, nulls_first=True), + col_type_2.sort(ascending=True, nulls_first=True), +).show() +``` + + +Each row belongs to exactly one grouping level. 
The [`grouping`][datafusion.functions.grouping] +function tells you which level each row comes from: + +```python exec="1" source="material-block" result="text" session="aggregations" +df.aggregate( + [GroupingSet.grouping_sets([col_type_1], [col_type_2])], + [ + f.count(col_speed).alias("Count"), + f.avg(col_speed).alias("Avg Speed"), + f.grouping(col_type_1).alias("grouping(Type 1)"), + f.grouping(col_type_2).alias("grouping(Type 2)"), + ], +).sort( + col_type_1.sort(ascending=True, nulls_first=True), + col_type_2.sort(ascending=True, nulls_first=True), +).show() +``` + + +Where `grouping(Type 1)` is `0` the row is a per-`Type 1` total (and `Type 2` is `null`). +Where `grouping(Type 2)` is `0` the row is a per-`Type 2` total (and `Type 1` is `null`). + +## Aggregate Functions + +The available aggregate functions are: + +01. Comparison Functions + : - [`min`][datafusion.functions.min] + - [`max`][datafusion.functions.max] +02. Math Functions + : - [`sum`][datafusion.functions.sum] + - [`avg`][datafusion.functions.avg] + - [`median`][datafusion.functions.median] +03. Array Functions + : - [`array_agg`][datafusion.functions.array_agg] +04. Logical Functions + : - [`bit_and`][datafusion.functions.bit_and] + - [`bit_or`][datafusion.functions.bit_or] + - [`bit_xor`][datafusion.functions.bit_xor] + - [`bool_and`][datafusion.functions.bool_and] + - [`bool_or`][datafusion.functions.bool_or] +05. Statistical Functions + : - [`count`][datafusion.functions.count] + - [`corr`][datafusion.functions.corr] + - [`covar_samp`][datafusion.functions.covar_samp] + - [`covar_pop`][datafusion.functions.covar_pop] + - [`stddev`][datafusion.functions.stddev] + - [`stddev_pop`][datafusion.functions.stddev_pop] + - [`var_samp`][datafusion.functions.var_samp] + - [`var_pop`][datafusion.functions.var_pop] + - [`var_population`][datafusion.functions.var_population] +06. 
Linear Regression Functions + : - [`regr_count`][datafusion.functions.regr_count] + - [`regr_slope`][datafusion.functions.regr_slope] + - [`regr_intercept`][datafusion.functions.regr_intercept] + - [`regr_r2`][datafusion.functions.regr_r2] + - [`regr_avgx`][datafusion.functions.regr_avgx] + - [`regr_avgy`][datafusion.functions.regr_avgy] + - [`regr_sxx`][datafusion.functions.regr_sxx] + - [`regr_syy`][datafusion.functions.regr_syy] + - [`regr_sxy`][datafusion.functions.regr_sxy] +07. Positional Functions + : - [`first_value`][datafusion.functions.first_value] + - [`last_value`][datafusion.functions.last_value] + - [`nth_value`][datafusion.functions.nth_value] +08. String Functions + : - [`string_agg`][datafusion.functions.string_agg] +09. Percentile Functions + : - [`percentile_cont`][datafusion.functions.percentile_cont] + - [`quantile_cont`][datafusion.functions.quantile_cont] + - [`approx_distinct`][datafusion.functions.approx_distinct] + - [`approx_median`][datafusion.functions.approx_median] + - [`approx_percentile_cont`][datafusion.functions.approx_percentile_cont] + - [`approx_percentile_cont_with_weight`][datafusion.functions.approx_percentile_cont_with_weight] +10. Grouping Set Functions + : - [`grouping`][datafusion.functions.grouping] + - [`rollup`][datafusion.expr.GroupingSet.rollup] + - [`cube`][datafusion.expr.GroupingSet.cube] + - [`grouping_sets`][datafusion.expr.GroupingSet.grouping_sets] + +## User-Defined Aggregate Functions + +You can ship custom aggregations to the engine by subclassing +[`Accumulator`][datafusion.user_defined.Accumulator] and registering it via +[`udaf`][datafusion.user_defined.udaf]. See [`user_defined`](../../reference/datafusion/user_defined.md) +for the accumulator interface and worked examples. + +
+

Note

+ +Serialization + +
+ + Python aggregate UDFs travel inline inside pickled or + [`to_bytes`][datafusion.expr.Expr.to_bytes]-serialized expressions — + the accumulator class is captured by value via [`cloudpickle`][cloudpickle], + so worker processes do not need to pre-register the UDF. Any names + the accumulator resolves via `import` are captured **by reference** + and must be importable on the receiving worker. See + [`ipc`][datafusion.ipc] for the full IPC model and security caveats. diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.rst deleted file mode 100644 index b1e43a32f..000000000 --- a/docs/source/user-guide/common-operations/aggregations.rst +++ /dev/null @@ -1,454 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _aggregation: - -Aggregation -============ - -An aggregate or aggregation is a function where the values of multiple rows are processed together -to form a single summary value. For performing an aggregation, DataFusion provides the -:py:func:`~datafusion.dataframe.DataFrame.aggregate` - -.. 
ipython:: python - - from datafusion import SessionContext, col, lit, functions as f - - ctx = SessionContext() - df = ctx.read_csv("pokemon.csv") - - col_type_1 = col('"Type 1"') - col_type_2 = col('"Type 2"') - col_speed = col('"Speed"') - col_attack = col('"Attack"') - - df.aggregate([col_type_1], [ - f.approx_distinct(col_speed).alias("Count"), - f.approx_median(col_speed).alias("Median Speed"), - f.approx_percentile_cont(col_speed, 0.9).alias("90% Speed")]) - -When :code:`group_by` is :code:`None` or an empty list, the aggregation is done over the whole -:class:`.DataFrame`. For grouping the :code:`group_by` list must contain at least one column. - -.. ipython:: python - - df.aggregate([col_type_1], [ - f.max(col_speed).alias("Max Speed"), - f.avg(col_speed).alias("Avg Speed"), - f.min(col_speed).alias("Min Speed")]) - -More than one column can be used for grouping - -.. ipython:: python - - df.aggregate([col_type_1, col_type_2], [ - f.max(col_speed).alias("Max Speed"), - f.avg(col_speed).alias("Avg Speed"), - f.min(col_speed).alias("Min Speed")]) - - - -Setting Parameters ------------------- - -Each of the built in aggregate functions provides arguments for the parameters that affect their -operation. These can also be overridden using the builder approach to setting any of the following -parameters. When you use the builder, you must call ``build()`` to finish. For example, these two -expressions are equivalent. - -.. ipython:: python - - first_1 = f.first_value(col("a"), order_by=[col("a")]) - first_2 = f.first_value(col("a")).order_by(col("a")).build() - -Ordering -^^^^^^^^ - -You can control the order in which rows are processed by window functions by providing -a list of ``order_by`` functions for the ``order_by`` parameter. In the following example, we -sort the Pokemon by their attack in increasing order and take the first value, which gives us the -Pokemon with the smallest attack value in each ``Type 1``. - -.. 
ipython:: python - - df.aggregate( - [col('"Type 1"')], - [f.first_value( - col('"Name"'), - order_by=[col('"Attack"').sort(ascending=True)] - ).alias("Smallest Attack") - ]) - -Distinct -^^^^^^^^ - -When you set the parameter ``distinct`` to ``True``, then unique values will only be evaluated one -time each. Suppose we want to create an array of all of the ``Type 2`` for each ``Type 1`` of our -Pokemon set. Since there will be many entries of ``Type 2`` we only one each distinct value. - -.. ipython:: python - - df.aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")]) - -In the output of the above we can see that there are some ``Type 1`` for which the ``Type 2`` entry -is ``null``. In reality, we probably want to filter those out. We can do this in two ways. First, -we can filter DataFrame rows that have no ``Type 2``. If we do this, we might have some ``Type 1`` -entries entirely removed. The second is we can use the ``filter`` argument described below. - -.. ipython:: python - - df.filter(col_type_2.is_not_null()).aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")]) - - df.aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True, filter=col_type_2.is_not_null()).alias("Type 2 List")]) - -Which approach you take should depend on your use case. - -Null Treatment -^^^^^^^^^^^^^^ - -This option allows you to either respect or ignore null values. - -One common usage for handling nulls is the case where you want to find the first value within a -partition. By setting the null treatment to ignore nulls, we can find the first non-null value -in our partition. - - -.. 
ipython:: python - - from datafusion.common import NullTreatment - - df.aggregate([col_type_1], [ - f.first_value( - col_type_2, - order_by=[col_attack], - null_treatment=NullTreatment.RESPECT_NULLS - ).alias("Lowest Attack Type 2")]) - - df.aggregate([col_type_1], [ - f.first_value( - col_type_2, - order_by=[col_attack], - null_treatment=NullTreatment.IGNORE_NULLS - ).alias("Lowest Attack Type 2")]) - -Filter -^^^^^^ - -Using the filter option is useful for filtering results to include in the aggregate function. It can -be seen in the example above on how this can be useful to only filter rows evaluated by the -aggregate function without filtering rows from the entire DataFrame. - -Filter takes a single expression. - -Suppose we want to find the speed values for only Pokemon that have low Attack values. - -.. ipython:: python - - df.aggregate([col_type_1], [ - f.avg(col_speed).alias("Avg Speed All"), - f.avg(col_speed, filter=col_attack < lit(50)).alias("Avg Speed Low Attack")]) - - -Comparing subsets within a group -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Sometimes you need to compare the full membership of a group against a -subset that meets some condition — for example, "which groups have at least -one failure, but not every member failed?". The ``filter`` argument on an -aggregate restricts the rows that contribute to *that* aggregate without -dropping the group, so a single pass can produce both the full set and the -filtered subset side by side. Pairing -:py:func:`~datafusion.functions.array_agg` with ``distinct=True`` and -``filter=`` is a compact way to express this: collect the distinct values -of the group, collect the distinct values that satisfy the condition, then -compare the two arrays. - -Suppose each row records a line item with the supplier that fulfilled it and -a flag for whether that supplier met the commit date. We want to identify -*partially failed* orders — orders where at least one supplier failed but -not every supplier failed: - -.. 
ipython:: python - - orders_df = ctx.from_pydict( - { - "order_id": [1, 1, 1, 2, 2, 3, 4, 4], - "supplier_id": [100, 101, 102, 200, 201, 300, 400, 401], - "failed": [False, True, False, False, False, True, True, True], - }, - ) - - grouped = orders_df.aggregate( - [col("order_id")], - [ - f.array_agg(col("supplier_id"), distinct=True).alias("all_suppliers"), - f.array_agg( - col("supplier_id"), - filter=col("failed"), - distinct=True, - ).alias("failed_suppliers"), - ], - ) - - grouped.filter( - (f.array_length(col("failed_suppliers")) > lit(0)) - & (f.array_length(col("failed_suppliers")) < f.array_length(col("all_suppliers"))) - ).select(col("order_id"), col("failed_suppliers")) - -Order 1 is partial (one of three suppliers failed). Order 2 is excluded -because no supplier failed, order 3 because its only supplier failed, and -order 4 because both of its suppliers failed. - -Grouping Sets -------------- - -The default style of aggregation produces one row per group. Sometimes you want a single query to -produce rows at multiple levels of detail — for example, totals per type *and* an overall grand -total, or subtotals for every combination of two columns plus the individual column totals. Writing -separate queries and concatenating them is tedious and runs the data multiple times. Grouping sets -solve this by letting you specify several grouping levels in one pass. - -DataFusion supports three grouping set styles through the -:py:class:`~datafusion.expr.GroupingSet` class: - -- :py:meth:`~datafusion.expr.GroupingSet.rollup` — hierarchical subtotals, like a drill-down report -- :py:meth:`~datafusion.expr.GroupingSet.cube` — every possible subtotal combination, like a pivot table -- :py:meth:`~datafusion.expr.GroupingSet.grouping_sets` — explicitly list exactly which grouping levels you want - -Because result rows come from different grouping levels, a column that is *not* part of a -particular level will be ``null`` in that row. 
Use :py:func:`~datafusion.functions.grouping` to -distinguish a real ``null`` in the data from one that means "this column was aggregated across." -It returns ``0`` when the column is a grouping key for that row, and ``1`` when it is not. - -Rollup -^^^^^^ - -:py:meth:`~datafusion.expr.GroupingSet.rollup` creates a hierarchy. ``rollup(a, b)`` produces -grouping sets ``(a, b)``, ``(a)``, and ``()`` — like nested subtotals in a report. This is useful -when your columns have a natural hierarchy, such as region → city or type → subtype. - -Suppose we want to summarize Pokemon stats by ``Type 1`` with subtotals and a grand total. With -the default aggregation style we would need two separate queries. With ``rollup`` we get it all at -once: - -.. ipython:: python - - from datafusion.expr import GroupingSet - - df.aggregate( - [GroupingSet.rollup(col_type_1)], - [f.count(col_speed).alias("Count"), - f.avg(col_speed).alias("Avg Speed"), - f.max(col_speed).alias("Max Speed")] - ).sort(col_type_1.sort(ascending=True, nulls_first=True)) - -The first row — where ``Type 1`` is ``null`` — is the grand total across all types. But how do you -tell a grand-total ``null`` apart from a Pokemon that genuinely has no type? The -:py:func:`~datafusion.functions.grouping` function returns ``0`` when the column is a grouping key -for that row and ``1`` when it is aggregated across. - -.. note:: - - Due to an upstream DataFusion limitation - (`apache/datafusion#21411 `_), - ``.alias()`` cannot be applied directly to a ``grouping()`` expression — it will raise an - error at execution time. Instead, use - :py:meth:`~datafusion.dataframe.DataFrame.with_column_renamed` on the result DataFrame to - give the column a readable name. Once the upstream issue is resolved, you will be able to - use ``.alias()`` directly and the workaround below will no longer be necessary. 
- -The raw column name generated by ``grouping()`` contains internal identifiers, so we use -:py:meth:`~datafusion.dataframe.DataFrame.with_column_renamed` to clean it up: - -.. ipython:: python - - result = df.aggregate( - [GroupingSet.rollup(col_type_1)], - [f.count(col_speed).alias("Count"), - f.avg(col_speed).alias("Avg Speed"), - f.grouping(col_type_1)] - ) - for field in result.schema(): - if field.name.startswith("grouping("): - result = result.with_column_renamed(field.name, "Is Total") - result.sort(col_type_1.sort(ascending=True, nulls_first=True)) - -With two columns the hierarchy becomes more apparent. ``rollup(Type 1, Type 2)`` produces: - -- one row per ``(Type 1, Type 2)`` pair — the most detailed level -- one row per ``Type 1`` — subtotals -- one grand total row - -.. ipython:: python - - df.aggregate( - [GroupingSet.rollup(col_type_1, col_type_2)], - [f.count(col_speed).alias("Count"), - f.avg(col_speed).alias("Avg Speed")] - ).sort( - col_type_1.sort(ascending=True, nulls_first=True), - col_type_2.sort(ascending=True, nulls_first=True) - ) - -Cube -^^^^ - -:py:meth:`~datafusion.expr.GroupingSet.cube` produces every possible subset. ``cube(a, b)`` -produces grouping sets ``(a, b)``, ``(a)``, ``(b)``, and ``()`` — one more than ``rollup`` because -it also includes ``(b)`` alone. This is useful when neither column is "above" the other in a -hierarchy and you want all cross-tabulations. - -For our Pokemon data, ``cube(Type 1, Type 2)`` gives us stats broken down by the type pair, -by ``Type 1`` alone, by ``Type 2`` alone, and a grand total — all in one query: - -.. 
ipython:: python - - df.aggregate( - [GroupingSet.cube(col_type_1, col_type_2)], - [f.count(col_speed).alias("Count"), - f.avg(col_speed).alias("Avg Speed")] - ).sort( - col_type_1.sort(ascending=True, nulls_first=True), - col_type_2.sort(ascending=True, nulls_first=True) - ) - -Compared to the ``rollup`` example above, notice the extra rows where ``Type 1`` is ``null`` but -``Type 2`` has a value — those are the per-``Type 2`` subtotals that ``rollup`` does not include. - -Explicit Grouping Sets -^^^^^^^^^^^^^^^^^^^^^^ - -:py:meth:`~datafusion.expr.GroupingSet.grouping_sets` lets you list exactly which grouping levels -you need when ``rollup`` or ``cube`` would produce too many or too few. Each argument is a list of -columns forming one grouping set. - -For example, if we want only the per-``Type 1`` totals and per-``Type 2`` totals — but *not* the -full ``(Type 1, Type 2)`` detail rows or the grand total — we can ask for exactly that: - -.. ipython:: python - - df.aggregate( - [GroupingSet.grouping_sets([col_type_1], [col_type_2])], - [f.count(col_speed).alias("Count"), - f.avg(col_speed).alias("Avg Speed")] - ).sort( - col_type_1.sort(ascending=True, nulls_first=True), - col_type_2.sort(ascending=True, nulls_first=True) - ) - -Each row belongs to exactly one grouping level. The :py:func:`~datafusion.functions.grouping` -function tells you which level each row comes from: - -.. 
ipython:: python - - result = df.aggregate( - [GroupingSet.grouping_sets([col_type_1], [col_type_2])], - [f.count(col_speed).alias("Count"), - f.avg(col_speed).alias("Avg Speed"), - f.grouping(col_type_1), - f.grouping(col_type_2)] - ) - for field in result.schema(): - if field.name.startswith("grouping("): - clean = field.name.split(".")[-1].rstrip(")") - result = result.with_column_renamed(field.name, f"grouping({clean})") - result.sort( - col_type_1.sort(ascending=True, nulls_first=True), - col_type_2.sort(ascending=True, nulls_first=True) - ) - -Where ``grouping(Type 1)`` is ``0`` the row is a per-``Type 1`` total (and ``Type 2`` is ``null``). -Where ``grouping(Type 2)`` is ``0`` the row is a per-``Type 2`` total (and ``Type 1`` is ``null``). - - -Aggregate Functions -------------------- - -The available aggregate functions are: - -1. Comparison Functions - - :py:func:`datafusion.functions.min` - - :py:func:`datafusion.functions.max` -2. Math Functions - - :py:func:`datafusion.functions.sum` - - :py:func:`datafusion.functions.avg` - - :py:func:`datafusion.functions.median` -3. Array Functions - - :py:func:`datafusion.functions.array_agg` -4. Logical Functions - - :py:func:`datafusion.functions.bit_and` - - :py:func:`datafusion.functions.bit_or` - - :py:func:`datafusion.functions.bit_xor` - - :py:func:`datafusion.functions.bool_and` - - :py:func:`datafusion.functions.bool_or` -5. Statistical Functions - - :py:func:`datafusion.functions.count` - - :py:func:`datafusion.functions.corr` - - :py:func:`datafusion.functions.covar_samp` - - :py:func:`datafusion.functions.covar_pop` - - :py:func:`datafusion.functions.stddev` - - :py:func:`datafusion.functions.stddev_pop` - - :py:func:`datafusion.functions.var_samp` - - :py:func:`datafusion.functions.var_pop` - - :py:func:`datafusion.functions.var_population` -6. 
Linear Regression Functions - - :py:func:`datafusion.functions.regr_count` - - :py:func:`datafusion.functions.regr_slope` - - :py:func:`datafusion.functions.regr_intercept` - - :py:func:`datafusion.functions.regr_r2` - - :py:func:`datafusion.functions.regr_avgx` - - :py:func:`datafusion.functions.regr_avgy` - - :py:func:`datafusion.functions.regr_sxx` - - :py:func:`datafusion.functions.regr_syy` - - :py:func:`datafusion.functions.regr_slope` -7. Positional Functions - - :py:func:`datafusion.functions.first_value` - - :py:func:`datafusion.functions.last_value` - - :py:func:`datafusion.functions.nth_value` -8. String Functions - - :py:func:`datafusion.functions.string_agg` -9. Percentile Functions - - :py:func:`datafusion.functions.percentile_cont` - - :py:func:`datafusion.functions.quantile_cont` - - :py:func:`datafusion.functions.approx_distinct` - - :py:func:`datafusion.functions.approx_median` - - :py:func:`datafusion.functions.approx_percentile_cont` - - :py:func:`datafusion.functions.approx_percentile_cont_with_weight` -10. Grouping Set Functions - - :py:func:`datafusion.functions.grouping` - - :py:meth:`datafusion.expr.GroupingSet.rollup` - - :py:meth:`datafusion.expr.GroupingSet.cube` - - :py:meth:`datafusion.expr.GroupingSet.grouping_sets` - -User-Defined Aggregate Functions --------------------------------- - -You can ship custom aggregations to the engine by subclassing -:py:class:`~datafusion.user_defined.Accumulator` and registering it via -:py:func:`~datafusion.udaf`. See :py:mod:`datafusion.user_defined` for -the accumulator interface and worked examples. - -.. note:: Serialization - - Python aggregate UDFs travel inline inside pickled or - :py:meth:`~datafusion.expr.Expr.to_bytes`-serialized expressions — - the accumulator class is captured by value via :mod:`cloudpickle`, - so worker processes do not need to pre-register the UDF. 
Any names - the accumulator resolves via ``import`` are captured **by reference** - and must be importable on the receiving worker. See - :py:mod:`datafusion.ipc` for the full IPC model and security caveats. - diff --git a/docs/source/user-guide/common-operations/basic-info.md b/docs/source/user-guide/common-operations/basic-info.md new file mode 100644 index 000000000..967d44500 --- /dev/null +++ b/docs/source/user-guide/common-operations/basic-info.md @@ -0,0 +1,66 @@ + + +# Basic Operations + +In this section, you will learn how to display essential details of DataFrames using specific functions. + +```python exec="1" source="material-block" result="text" session="basic-info" +import random + +ctx = SessionContext() +df = ctx.from_pydict( + { + "nrs": [1, 2, 3, 4, 5], + "names": ["python", "ruby", "java", "haskell", "go"], + "random": random.sample(range(1000), 5), + "groups": ["A", "A", "B", "C", "B"], + } +) +print(df) +``` + + +Use [`limit`][datafusion.dataframe.DataFrame.limit] to view the top rows of the frame: + +```python exec="1" source="material-block" result="text" session="basic-info" +df.limit(2).show() +``` + + +Display the columns of the DataFrame using [`schema`][datafusion.dataframe.DataFrame.schema]: + +```python exec="1" source="material-block" result="text" session="basic-info" +print(df.schema()) +``` + + +The method [`to_pandas`][datafusion.dataframe.DataFrame.to_pandas] uses pyarrow to convert to pandas DataFrame, by collecting the batches, +passing them to an Arrow table, and then converting them to a pandas DataFrame. 
+ +```python exec="1" source="material-block" result="text" session="basic-info" +print(df.to_pandas()) +``` + + +[`describe`][datafusion.dataframe.DataFrame.describe] shows a quick statistic summary of your data: + +```python exec="1" source="material-block" result="text" session="basic-info" +df.describe().show() +``` diff --git a/docs/source/user-guide/common-operations/basic-info.rst b/docs/source/user-guide/common-operations/basic-info.rst deleted file mode 100644 index d48b49d5c..000000000 --- a/docs/source/user-guide/common-operations/basic-info.rst +++ /dev/null @@ -1,61 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Basic Operations -================ - -In this section, you will learn how to display essential details of DataFrames using specific functions. - -.. ipython:: python - - from datafusion import SessionContext - import random - - ctx = SessionContext() - df = ctx.from_pydict({ - "nrs": [1, 2, 3, 4, 5], - "names": ["python", "ruby", "java", "haskell", "go"], - "random": random.sample(range(1000), 5), - "groups": ["A", "A", "B", "C", "B"], - }) - df - -Use :py:func:`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame: - -.. 
ipython:: python - - df.limit(2) - -Display the columns of the DataFrame using :py:func:`~datafusion.dataframe.DataFrame.schema`: - -.. ipython:: python - - df.schema() - -The method :py:func:`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches, -passing them to an Arrow table, and then converting them to a pandas DataFrame. - -.. ipython:: python - - df.to_pandas() - -:py:func:`~datafusion.dataframe.DataFrame.describe` shows a quick statistic summary of your data: - -.. ipython:: python - - df.describe() - diff --git a/docs/source/user-guide/common-operations/expressions.md b/docs/source/user-guide/common-operations/expressions.md new file mode 100644 index 000000000..79f6cc569 --- /dev/null +++ b/docs/source/user-guide/common-operations/expressions.md @@ -0,0 +1,340 @@ + + + +# Expressions + +In DataFusion an expression is an abstraction that represents a computation. +Expressions are used as the primary inputs and outputs for most functions within +DataFusion. As such, expressions can be combined to create expression trees, a +concept shared across most compilers and databases. + +## Column + +The first expression most new users will interact with is the Column, which is created by calling [`col`][datafusion.col.col]. +This expression represents a column within a DataFrame. The function [`col`][datafusion.col.col] takes as input a string +and returns an expression as its output. + +## Literal + +Literal expressions represent a single value. These are helpful in a wide range of operations where +a specific, known value is of interest. You can create a literal expression using the function [`lit`][datafusion.lit]. +The type of the object passed to the [`lit`][datafusion.lit] function will be used to convert it to a known data type. + +In the following example we create expressions for the column named `color` and the literal scalar string `red`.
+The resultant variable `red_units` is itself also an expression. + +```python exec="1" source="material-block" session="expressions" +red_units = col("color") == lit("red") +``` + + +## Boolean + +When combining expressions that evaluate to a boolean value, you can combine these expressions using boolean operators. +It is important to note that in order to combine these expressions, you *must* use bitwise operators. See the following +examples for the and, or, and not operations. + +```python exec="1" source="material-block" session="expressions" +red_or_green_units = (col("color") == lit("red")) | (col("color") == lit("green")) +heavy_red_units = (col("color") == lit("red")) & (col("weight") > lit(42)) +not_red_units = ~(col("color") == lit("red")) +``` + + +## Arrays + +For columns that contain arrays of values, you can access individual elements of the array by index +using bracket indexing. This is similar to calling the function +[`array_element`][datafusion.functions.array_element], except that array indexing using brackets is 0 based, +similar to Python arrays and `array_element` is 1 based indexing to be compatible with other SQL +approaches. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import col + +ctx = SessionContext() +df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) +df.select(col("a")[0].alias("a0")).show() +``` + + +
+

Warning

+ +Indexing an element of an array via `[]` starts at index 0 whereas +[`array_element`][datafusion.functions.array_element] starts at index 1. + +
+ +Starting in DataFusion 49.0.0 you can also create slices of array elements using +slice syntax from Python. + +```python exec="1" source="material-block" result="text" session="expressions" +df.select(col("a")[1:3].alias("second_two_elements")).show() +``` + + +To check if an array is empty, you can use the function [`array_empty`][datafusion.functions.array_empty] or `datafusion.functions.empty`. +This function returns a boolean indicating whether the array is empty. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, col +from datafusion.functions import array_empty + +ctx = SessionContext() +df = ctx.from_pydict({"a": [[], [1, 2, 3]]}) +df.select(array_empty(col("a")).alias("is_empty")).show() +``` + + +In this example, the `is_empty` column will contain `True` for the first row and `False` for the second row. + +To get the total number of elements in an array, you can use the function [`cardinality`][datafusion.functions.cardinality]. +This function returns an integer indicating the total number of elements in the array. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, col +from datafusion.functions import cardinality + +ctx = SessionContext() +df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) +df.select(cardinality(col("a")).alias("num_elements")).show() +``` + + +In this example, the `num_elements` column will contain `3` for both rows. + +To concatenate two arrays, you can use the function [`array_cat`][datafusion.functions.array_cat] or [`array_concat`][datafusion.functions.array_concat]. +These functions return a new array that is the concatenation of the input arrays. 
+ +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, col +from datafusion.functions import array_cat + +ctx = SessionContext() +df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[4, 5, 6]]}) +df.select(array_cat(col("a"), col("b")).alias("concatenated_array")).show() +``` + + +In this example, the `concatenated_array` column will contain `[1, 2, 3, 4, 5, 6]`. + +To repeat the elements of an array a specified number of times, you can use the function [`array_repeat`][datafusion.functions.array_repeat]. +This function returns a new array with the elements repeated. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, col +from datafusion.functions import array_repeat + +ctx = SessionContext() +df = ctx.from_pydict({"a": [[1, 2, 3]]}) +df.select(array_repeat(col("a"), literal(2)).alias("repeated_array")).show() +``` + + +In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. + +## Lambda functions + +Some array functions take a *lambda function*: a small function that runs once +per element. [`array_transform`][datafusion.functions.array_transform] maps a lambda over +every element, [`array_filter`][datafusion.functions.array_filter] keeps the elements +for which a predicate lambda is true, and +[`array_any_match`][datafusion.functions.array_any_match] returns whether any element +satisfies a predicate lambda. (Functions that take another function as an +argument are sometimes called *higher-order* functions.) + +The simplest way to supply a lambda is a Python `lambda`. Its parameter names +become the lambda parameters, and its return value becomes the body. 
+ +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, col + +ctx = SessionContext() +df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]}) +df.select(f.array_transform(col("a"), lambda v: v * 2).alias("doubled")) +df.select(f.array_filter(col("a"), lambda v: v > 2).alias("big_only")) +df.select(f.array_any_match(col("a"), lambda v: v > 3).alias("has_big")).show() +``` + + +If you need explicit control over parameter names, build the lambda with +[`lambda_`][datafusion.functions.lambda_] and reference its parameters with +[`lambda_var`][datafusion.functions.lambda_var]. The following is equivalent to the +`array_transform` call above. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import lit + +double_fn = f.lambda_(["v"], f.lambda_var("v") * lit(2)) +df.select(f.array_transform(col("a"), double_fn).alias("doubled")).show() +``` + + +
+

Note

+ +Lambda expressions cannot yet be serialized: calling +[`to_bytes`][datafusion.expr.Expr.to_bytes] or pickling an expression that +contains a lambda raises `Lambda not implemented`. SQL lambda syntax is +only parsed by dialects that support lambdas; set +`datafusion.sql_parser.dialect` to one of `DuckDB`, `ClickHouse`, +`Snowflake`, or `Databricks`. Both arrow syntax (`x -> x * 2`) and +keyword syntax (`lambda x: x * 2`) parse. DuckDB will drop the arrow +form in v2.1, so prefer `lambda x: x * 2` for forward compatibility. +The Python expression builder shown above works regardless of dialect. + +
+ +## Testing membership in a list + +A common need is filtering rows where a column equals *any* of a small set of +values. DataFusion offers three forms; they differ in readability and in how +they scale: + +1. A compound boolean using `|` across explicit equalities. +2. [`in_list`][datafusion.functions.in_list], which accepts a list of + expressions and tests equality against all of them in one call. +3. A trick with [`array_position`][datafusion.functions.array_position] and + [`make_array`][datafusion.functions.make_array], which returns the 1-based + index of the value in a constructed array, or null if it is not present. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, col, lit +from datafusion import functions as f + +ctx = SessionContext() +df = ctx.from_pydict({"shipmode": ["MAIL", "SHIP", "AIR", "TRUCK", "RAIL"]}) + +# Option 1: compound boolean. Fine for two values; awkward past three. +df.filter((col("shipmode") == lit("MAIL")) | (col("shipmode") == lit("SHIP"))) + +# Option 2: in_list. Preferred for readability as the set grows. +df.filter(f.in_list(col("shipmode"), [lit("MAIL"), lit("SHIP")])) + +# Option 3: array_position / make_array. Useful when you already have the +# set as an array column and want "is in that array" semantics. +df.filter( + ~f.array_position(f.make_array(lit("MAIL"), lit("SHIP")), col("shipmode")).is_null() +).show() +``` + + +Use `in_list` as the default. It is explicit, readable, and matches the +semantics users expect from SQL's `IN (...)`. Reach for the +`array_position` form only when the membership set is itself an array +column rather than a literal list. + +## Conditional expressions + +DataFusion provides [`case`][datafusion.functions.case] for the SQL +`CASE` expression in both its switched and searched forms, along with +[`when`][datafusion.functions.when] as a standalone builder for the +searched form. 
+ +**Switched CASE** (one expression compared against several literal values): + +```python exec="1" source="material-block" result="text" session="expressions" +df = ctx.from_pydict( + {"priority": ["1-URGENT", "2-HIGH", "3-MEDIUM", "5-LOW"]}, +) + +df.select( + col("priority"), + f.case(col("priority")) + .when(lit("1-URGENT"), lit(1)) + .when(lit("2-HIGH"), lit(1)) + .otherwise(lit(0)) + .alias("is_high_priority"), +).show() +``` + + +**Searched CASE** (an independent boolean predicate per branch). Use this +form whenever a branch tests more than simple equality — for example, +checking whether a joined column is `NULL` to gate a computed value: + +```python exec="1" source="material-block" result="text" session="expressions" +df = ctx.from_pydict( + {"volume": [10.0, 20.0, 30.0], "supplier_id": [1, None, 2]}, +) + +df.select( + col("volume"), + col("supplier_id"), + f.when(col("supplier_id").is_not_null(), col("volume")) + .otherwise(lit(0.0)) + .alias("attributed_volume"), +).show() +``` + + +This searched-CASE pattern is idiomatic for "attribute the measure to the +matching side of a left join, otherwise contribute zero" — a shape that +appears in TPC-H Q08 and similar market-share calculations. + +If a switched CASE only groups several equality matches into one bucket, +`f.when(f.in_list(col(...), [...]), value).otherwise(default)` is often +simpler than the full `case` builder. + +## Structs + +Columns that contain struct elements can be accessed using the bracket notation as if they were +Python dictionary style objects. This expects a string key as the parameter passed. + +```python exec="1" source="material-block" result="text" session="expressions" +ctx = SessionContext() +data = {"a": [{"size": 15, "color": "green"}, {"size": 10, "color": "blue"}]} +df = ctx.from_pydict(data) +df.select(col("a")["size"].alias("a_size")).show() +``` + + +## Functions + +As mentioned before, most functions in DataFusion return an expression at their output. 
This allows us to create +a wide variety of expressions built up from other expressions. For example, [`alias`][datafusion.expr.Expr.alias] is a function that takes +as its input a single expression and returns an expression in which the name of the expression has changed. + +The following example shows a series of expressions that are built up from functions operating on expressions. + +```python exec="1" source="material-block" result="text" session="expressions" +from datafusion import SessionContext, lit +from datafusion import functions as f + +ctx = SessionContext() +df = ctx.from_pydict( + { + "name": ["Albert", "Becca", "Carlos", "Dante"], + "age": [42, 67, 27, 71], + "years_in_position": [13, 21, 10, 54], + }, + name="employees", +) + +age_col = col("age") +renamed_age = age_col.alias("age_in_years") +start_age = age_col - col("years_in_position") +started_young = start_age < lit(18) +can_retire = age_col > lit(65) +long_timer = started_young & can_retire + +df.filter(long_timer).select(col("name"), renamed_age, col("years_in_position")).show() +``` diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.rst deleted file mode 100644 index f52c79ddb..000000000 --- a/docs/source/user-guide/common-operations/expressions.rst +++ /dev/null @@ -1,337 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -..
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _expressions: - -Expressions -=========== - -In DataFusion an expression is an abstraction that represents a computation. -Expressions are used as the primary inputs and outputs for most functions within -DataFusion. As such, expressions can be combined to create expression trees, a -concept shared across most compilers and databases. - -Column ------- - -The first expression most new users will interact with is the Column, which is created by calling :py:func:`~datafusion.col`. -This expression represents a column within a DataFrame. The function :py:func:`~datafusion.col` takes as in input a string -and returns an expression as it's output. - -Literal -------- - -Literal expressions represent a single value. These are helpful in a wide range of operations where -a specific, known value is of interest. You can create a literal expression using the function :py:func:`~datafusion.lit`. -The type of the object passed to the :py:func:`~datafusion.lit` function will be used to convert it to a known data type. - -In the following example we create expressions for the column named `color` and the literal scalar string `red`. -The resultant variable `red_units` is itself also an expression. - -.. ipython:: python - - red_units = col("color") == lit("red") - -Boolean -------- - -When combining expressions that evaluate to a boolean value, you can combine these expressions using boolean operators. -It is important to note that in order to combine these expressions, you *must* use bitwise operators. See the following -examples for the and, or, and not operations. - - -.. 
ipython:: python - - red_or_green_units = (col("color") == lit("red")) | (col("color") == lit("green")) - heavy_red_units = (col("color") == lit("red")) & (col("weight") > lit(42)) - not_red_units = ~(col("color") == lit("red")) - -Arrays ------- - -For columns that contain arrays of values, you can access individual elements of the array by index -using bracket indexing. This is similar to calling the function -:py:func:`datafusion.functions.array_element`, except that array indexing using brackets is 0 based, -similar to Python arrays and ``array_element`` is 1 based indexing to be compatible with other SQL -approaches. - -.. ipython:: python - - from datafusion import SessionContext, col - - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) - df.select(col("a")[0].alias("a0")) - -.. warning:: - - Indexing an element of an array via ``[]`` starts at index 0 whereas - :py:func:`~datafusion.functions.array_element` starts at index 1. - -Starting in DataFusion 49.0.0 you can also create slices of array elements using -slice syntax from Python. - -.. ipython:: python - - df.select(col("a")[1:3].alias("second_two_elements")) - -To check if an array is empty, you can use the function :py:func:`datafusion.functions.array_empty` or `datafusion.functions.empty`. -This function returns a boolean indicating whether the array is empty. - -.. ipython:: python - - from datafusion import SessionContext, col - from datafusion.functions import array_empty - - ctx = SessionContext() - df = ctx.from_pydict({"a": [[], [1, 2, 3]]}) - df.select(array_empty(col("a")).alias("is_empty")) - -In this example, the `is_empty` column will contain `True` for the first row and `False` for the second row. - -To get the total number of elements in an array, you can use the function :py:func:`datafusion.functions.cardinality`. -This function returns an integer indicating the total number of elements in the array. - -.. 
ipython:: python - - from datafusion import SessionContext, col - from datafusion.functions import cardinality - - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]}) - df.select(cardinality(col("a")).alias("num_elements")) - -In this example, the `num_elements` column will contain `3` for both rows. - -To concatenate two arrays, you can use the function :py:func:`datafusion.functions.array_cat` or :py:func:`datafusion.functions.array_concat`. -These functions return a new array that is the concatenation of the input arrays. - -.. ipython:: python - - from datafusion import SessionContext, col - from datafusion.functions import array_cat, array_concat - - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[4, 5, 6]]}) - df.select(array_cat(col("a"), col("b")).alias("concatenated_array")) - -In this example, the `concatenated_array` column will contain `[1, 2, 3, 4, 5, 6]`. - -To repeat the elements of an array a specified number of times, you can use the function :py:func:`datafusion.functions.array_repeat`. -This function returns a new array with the elements repeated. - -.. ipython:: python - - from datafusion import SessionContext, col, literal - from datafusion.functions import array_repeat - - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3]]}) - df.select(array_repeat(col("a"), literal(2)).alias("repeated_array")) - -In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`. - -Lambda functions ----------------- - -Some array functions take a *lambda function*: a small function that runs once -per element. :py:func:`~datafusion.functions.array_transform` maps a lambda over -every element, :py:func:`~datafusion.functions.array_filter` keeps the elements -for which a predicate lambda is true, and -:py:func:`~datafusion.functions.array_any_match` returns whether any element -satisfies a predicate lambda. 
(Functions that take another function as an -argument are sometimes called *higher-order* functions.) - -The simplest way to supply a lambda is a Python ``lambda``. Its parameter names -become the lambda parameters, and its return value becomes the body. - -.. ipython:: python - - from datafusion import SessionContext, col - from datafusion import functions as f - - ctx = SessionContext() - df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5]]}) - df.select(f.array_transform(col("a"), lambda v: v * 2).alias("doubled")) - df.select(f.array_filter(col("a"), lambda v: v > 2).alias("big_only")) - df.select(f.array_any_match(col("a"), lambda v: v > 3).alias("has_big")) - -If you need explicit control over parameter names, build the lambda with -:py:func:`~datafusion.functions.lambda_` and reference its parameters with -:py:func:`~datafusion.functions.lambda_var`. The following is equivalent to the -``array_transform`` call above. - -.. ipython:: python - - from datafusion import lit - - double_fn = f.lambda_(["v"], f.lambda_var("v") * lit(2)) - df.select(f.array_transform(col("a"), double_fn).alias("doubled")) - -.. note:: - - Lambda expressions cannot yet be serialized: calling - :py:meth:`~datafusion.expr.Expr.to_bytes` or pickling an expression that - contains a lambda raises ``Lambda not implemented``. SQL lambda syntax is - only parsed by dialects that support lambdas; set - ``datafusion.sql_parser.dialect`` to one of ``DuckDB``, ``ClickHouse``, - ``Snowflake``, or ``Databricks``. Both arrow syntax (``x -> x * 2``) and - keyword syntax (``lambda x: x * 2``) parse. DuckDB will drop the arrow - form in v2.1, so prefer ``lambda x: x * 2`` for forward compatibility. - The Python expression builder shown above works regardless of dialect. - - -Testing membership in a list ----------------------------- - -A common need is filtering rows where a column equals *any* of a small set of -values. DataFusion offers three forms; they differ in readability and in how -they scale: - -1. 
A compound boolean using ``|`` across explicit equalities. -2. :py:func:`~datafusion.functions.in_list`, which accepts a list of - expressions and tests equality against all of them in one call. -3. A trick with :py:func:`~datafusion.functions.array_position` and - :py:func:`~datafusion.functions.make_array`, which returns the 1-based - index of the value in a constructed array, or null if it is not present. - -.. ipython:: python - - from datafusion import SessionContext, col, lit - from datafusion import functions as f - - ctx = SessionContext() - df = ctx.from_pydict({"shipmode": ["MAIL", "SHIP", "AIR", "TRUCK", "RAIL"]}) - - # Option 1: compound boolean. Fine for two values; awkward past three. - df.filter((col("shipmode") == lit("MAIL")) | (col("shipmode") == lit("SHIP"))) - - # Option 2: in_list. Preferred for readability as the set grows. - df.filter(f.in_list(col("shipmode"), [lit("MAIL"), lit("SHIP")])) - - # Option 3: array_position / make_array. Useful when you already have the - # set as an array column and want "is in that array" semantics. - df.filter( - ~f.array_position( - f.make_array(lit("MAIL"), lit("SHIP")), col("shipmode") - ).is_null() - ) - -Use ``in_list`` as the default. It is explicit, readable, and matches the -semantics users expect from SQL's ``IN (...)``. Reach for the -``array_position`` form only when the membership set is itself an array -column rather than a literal list. - -Conditional expressions ------------------------ - -DataFusion provides :py:func:`~datafusion.functions.case` for the SQL -``CASE`` expression in both its switched and searched forms, along with -:py:func:`~datafusion.functions.when` as a standalone builder for the -searched form. - -**Switched CASE** (one expression compared against several literal values): - -.. 
ipython:: python - - df = ctx.from_pydict( - {"priority": ["1-URGENT", "2-HIGH", "3-MEDIUM", "5-LOW"]}, - ) - - df.select( - col("priority"), - f.case(col("priority")) - .when(lit("1-URGENT"), lit(1)) - .when(lit("2-HIGH"), lit(1)) - .otherwise(lit(0)) - .alias("is_high_priority"), - ) - -**Searched CASE** (an independent boolean predicate per branch). Use this -form whenever a branch tests more than simple equality — for example, -checking whether a joined column is ``NULL`` to gate a computed value: - -.. ipython:: python - - df = ctx.from_pydict( - {"volume": [10.0, 20.0, 30.0], "supplier_id": [1, None, 2]}, - ) - - df.select( - col("volume"), - col("supplier_id"), - f.when(col("supplier_id").is_not_null(), col("volume")) - .otherwise(lit(0.0)) - .alias("attributed_volume"), - ) - -This searched-CASE pattern is idiomatic for "attribute the measure to the -matching side of a left join, otherwise contribute zero" — a shape that -appears in TPC-H Q08 and similar market-share calculations. - -If a switched CASE only groups several equality matches into one bucket, -``f.when(f.in_list(col(...), [...]), value).otherwise(default)`` is often -simpler than the full ``case`` builder. - -Structs -------- - -Columns that contain struct elements can be accessed using the bracket notation as if they were -Python dictionary style objects. This expects a string key as the parameter passed. - -.. ipython:: python - - ctx = SessionContext() - data = {"a": [{"size": 15, "color": "green"}, {"size": 10, "color": "blue"}]} - df = ctx.from_pydict(data) - df.select(col("a")["size"].alias("a_size")) - - -Functions ---------- - -As mentioned before, most functions in DataFusion return an expression at their output. This allows us to create -a wide variety of expressions built up from other expressions. For example, :py:func:`~datafusion.expr.Expr.alias` is a function that takes -as it input a single expression and returns an expression in which the name of the expression has changed. 
- -The following example shows a series of expressions that are built up from functions operating on expressions. - -.. ipython:: python - - from datafusion import SessionContext - from datafusion import column, lit - from datafusion import functions as f - import random - - ctx = SessionContext() - df = ctx.from_pydict( - { - "name": ["Albert", "Becca", "Carlos", "Dante"], - "age": [42, 67, 27, 71], - "years_in_position": [13, 21, 10, 54], - }, - name="employees" - ) - - age_col = col("age") - renamed_age = age_col.alias("age_in_years") - start_age = age_col - col("years_in_position") - started_young = start_age < lit(18) - can_retire = age_col > lit(65) - long_timer = started_young & can_retire - - df.filter(long_timer).select(col("name"), renamed_age, col("years_in_position")) diff --git a/docs/source/user-guide/common-operations/functions.md b/docs/source/user-guide/common-operations/functions.md new file mode 100644 index 000000000..cef71e65e --- /dev/null +++ b/docs/source/user-guide/common-operations/functions.md @@ -0,0 +1,148 @@ + + +# Functions + +DataFusion provides a large number of built-in functions for performing complex queries without requiring user-defined functions. +In here we will cover some of the more popular use cases. If you want to view all the functions go to the [`Functions`][datafusion.functions] API Reference. + +We'll use the pokemon dataset in the following examples. 
+ +```python exec="1" source="material-block" session="functions" +ctx = SessionContext() +ctx.register_csv("pokemon", "pokemon.csv") +df = ctx.table("pokemon") +``` + + +## Mathematical + +DataFusion offers mathematical functions such as [`pow`][datafusion.functions.pow] or [`log`][datafusion.functions.log] + +```python exec="1" source="material-block" result="text" session="functions" +from datafusion import str_lit, string_literal + +df.select( + f.pow(col('"Attack"'), literal(2)) - f.pow(col('"Defense"'), literal(2)) +).limit(10).show() +``` + + +## Conditional + +There are 3 conditional functions in DataFusion: [`coalesce`][datafusion.functions.coalesce], [`nullif`][datafusion.functions.nullif] and [`case`][datafusion.functions.case]. + +```python exec="1" source="material-block" result="text" session="functions" +df.select(f.coalesce(col('"Type 1"'), col('"Type 2"')).alias("dominant_type")).limit(10).show() +``` + + +## Temporal + +For selecting the current time use [`now`][datafusion.functions.now] + +```python exec="1" source="material-block" result="text" session="functions" +df.select(f.now()).show() +``` + + +Convert to timestamps using [`to_timestamp`][datafusion.functions.to_timestamp] + +```python exec="1" source="material-block" result="text" session="functions" +df.select(f.to_timestamp(col('"Total"')).alias("timestamp")).show() +``` + + +Extracting parts of a date using [`date_part`][datafusion.functions.date_part] (alias [`extract`][datafusion.functions.extract]) + +```python exec="1" source="material-block" result="text" session="functions" +df.select( + f.date_part(literal("month"), f.to_timestamp(col('"Total"'))).alias("month"), + f.extract(literal("day"), f.to_timestamp(col('"Total"'))).alias("day"), +).show() +``` + + +## String + +In the field of data science, working with textual data is a common task. To make string manipulation easier, +DataFusion offers a range of helpful options.
+ +```python exec="1" source="material-block" result="text" session="functions" +df.select( + f.char_length(col('"Name"')).alias("len"), + f.lower(col('"Name"')).alias("lower"), + f.left(col('"Name"'), literal(4)).alias("code"), +).show() +``` + + +This also includes the functions for regular expressions like [`regexp_replace`][datafusion.functions.regexp_replace] and [`regexp_match`][datafusion.functions.regexp_match] + +```python exec="1" source="material-block" result="text" session="functions" +df.select( + f.regexp_match(col('"Name"'), literal("Char")).alias("dragons"), + f.regexp_replace(col('"Name"'), literal("saur"), literal("fleur")).alias("flowers"), +).show() +``` + + +## Casting + +Casting expressions to different data types using [`arrow_cast`][datafusion.functions.arrow_cast] + +```python exec="1" source="material-block" result="text" session="functions" +df.select( + f.arrow_cast(col('"Total"'), string_literal("Float64")).alias("total_as_float"), + f.arrow_cast(col('"Total"'), str_lit("Int32")).alias("total_as_int"), +).show() +``` + + +## Other + +The function [`in_list`][datafusion.functions.in_list] lets you check a column for the presence of multiple values: + +```python exec="1" source="material-block" result="text" session="functions" +types = [literal("Grass"), literal("Fire"), literal("Water")] +print(( + df.select(f.in_list(col('"Type 1"'), types, negated=False).alias("basic_types")) + .limit(20) + .to_pandas() +)) +``` + + +# Handling Missing Values + +DataFusion provides methods to handle missing values in DataFrames: + +## fill_null + +The `fill_null()` method replaces NULL values in specified columns with a provided value: + +```python +# Fill all NULL values with 0 where possible +df = df.fill_null(0) + +# Fill NULL values only in specific string columns +df = df.fill_null("missing", subset=["name", "category"]) +``` + +The fill value will be cast to match each column's type.
If casting fails for a column, that column remains unchanged. diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.rst deleted file mode 100644 index ccb47a4e7..000000000 --- a/docs/source/user-guide/common-operations/functions.rst +++ /dev/null @@ -1,152 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Functions -========= - -DataFusion provides a large number of built-in functions for performing complex queries without requiring user-defined functions. -In here we will cover some of the more popular use cases. If you want to view all the functions go to the :py:mod:`Functions ` API Reference. - -We'll use the pokemon dataset in the following examples. - -.. ipython:: python - - from datafusion import SessionContext - - ctx = SessionContext() - ctx.register_csv("pokemon", "pokemon.csv") - df = ctx.table("pokemon") - -Mathematical ------------- - -DataFusion offers mathematical functions such as :py:func:`~datafusion.functions.pow` or :py:func:`~datafusion.functions.log` - -.. 
ipython:: python - - from datafusion import col, literal, string_literal, str_lit - from datafusion import functions as f - - df.select( - f.pow(col('"Attack"'), literal(2)) - f.pow(col('"Defense"'), literal(2)) - ).limit(10) - - -Conditional ------------ - -There 3 conditional functions in DataFusion :py:func:`~datafusion.functions.coalesce`, :py:func:`~datafusion.functions.nullif` and :py:func:`~datafusion.functions.case`. - -.. ipython:: python - - df.select( - f.coalesce(col('"Type 1"'), col('"Type 2"')).alias("dominant_type") - ).limit(10) - -Temporal --------- - -For selecting the current time use :py:func:`~datafusion.functions.now` - -.. ipython:: python - - df.select(f.now()) - -Convert to timestamps using :py:func:`~datafusion.functions.to_timestamp` - -.. ipython:: python - - df.select(f.to_timestamp(col('"Total"')).alias("timestamp")) - -Extracting parts of a date using :py:func:`~datafusion.functions.date_part` (alias :py:func:`~datafusion.functions.extract`) - -.. ipython:: python - - df.select( - f.date_part(literal("month"), f.to_timestamp(col('"Total"'))).alias("month"), - f.extract(literal("day"), f.to_timestamp(col('"Total"'))).alias("day") - ) - -String ------- - -In the field of data science, working with textual data is a common task. To make string manipulation easier, -DataFusion offers a range of helpful options. - -.. ipython:: python - - df.select( - f.char_length(col('"Name"')).alias("len"), - f.lower(col('"Name"')).alias("lower"), - f.left(col('"Name"'), literal(4)).alias("code") - ) - -This also includes the functions for regular expressions like :py:func:`~datafusion.functions.regexp_replace` and :py:func:`~datafusion.functions.regexp_match` - -.. 
ipython:: python - - df.select( - f.regexp_match(col('"Name"'), literal("Char")).alias("dragons"), - f.regexp_replace(col('"Name"'), literal("saur"), literal("fleur")).alias("flowers") - ) - -Casting -------- - -Casting expressions to different data types using :py:func:`~datafusion.functions.arrow_cast` - -.. ipython:: python - - df.select( - f.arrow_cast(col('"Total"'), string_literal("Float64")).alias("total_as_float"), - f.arrow_cast(col('"Total"'), str_lit("Int32")).alias("total_as_int") - ) - -Other ------ - -The function :py:func:`~datafusion.functions.in_list` allows to check a column for the presence of multiple values: - -.. ipython:: python - - types = [literal("Grass"), literal("Fire"), literal("Water")] - ( - df.select(f.in_list(col('"Type 1"'), types, negated=False).alias("basic_types")) - .limit(20) - .to_pandas() - ) - - -Handling Missing Values -======================= - -DataFusion provides methods to handle missing values in DataFrames: - -fill_null ---------- - -The ``fill_null()`` method replaces NULL values in specified columns with a provided value: - -.. code-block:: python - - # Fill all NULL values with 0 where possible - df = df.fill_null(0) - - # Fill NULL values only in specific string columns - df = df.fill_null("missing", subset=["name", "category"]) - -The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged. diff --git a/docs/source/user-guide/common-operations/index.md b/docs/source/user-guide/common-operations/index.md new file mode 100644 index 000000000..3a314085b --- /dev/null +++ b/docs/source/user-guide/common-operations/index.md @@ -0,0 +1,41 @@ + + +# Common Operations + +The contents of this section are designed to guide a new user through how to use DataFusion. + +## Contents + +- [Basic Info](basic-info.md) — inspecting schema, row counts, and + summary statistics. +- [Views](views.md) — saving and reusing query fragments as views. 
+- [Select and Filter](select-and-filter.md) — projecting columns and + applying predicates. +- [Expressions](expressions.md) — `col`, `lit`, boolean operators, + array indexing, and chaining. +- [Joins](joins.md) — inner / outer / semi / anti joins. +- [Functions](functions.md) — scalar functions across math, string, + date/time, and array families. +- [Aggregations](aggregations.md) — `group_by`, rollup, cube, + grouping sets. +- [Windows](windows.md) — partitioned and ranking window functions. +- [User-Defined Functions](udf-and-udfa.md) — scalar (UDF), + aggregate (UDAF), window (UDWF), and table (UDTF) user-defined + functions. diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst deleted file mode 100644 index 7abd1f138..000000000 --- a/docs/source/user-guide/common-operations/index.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Common Operations -================= - -The contents of this section are designed to guide a new user through how to use DataFusion. - -.. 
toctree:: - :maxdepth: 2 - - views - basic-info - select-and-filter - expressions - joins - functions - aggregations - windows - udf-and-udfa diff --git a/docs/source/user-guide/common-operations/joins.md b/docs/source/user-guide/common-operations/joins.md new file mode 100644 index 000000000..4f38005e7 --- /dev/null +++ b/docs/source/user-guide/common-operations/joins.md @@ -0,0 +1,171 @@ + + +# Joins + +DataFusion supports the following join variants via the method [`join`][datafusion.dataframe.DataFrame.join] + +- Inner Join +- Left Join +- Right Join +- Full Join +- Left Semi Join +- Left Anti Join + +For the examples in this section we'll use the following two DataFrames + +```python exec="1" source="material-block" session="joins" +ctx = SessionContext() + +left = ctx.from_pydict( + { + "customer_id": [1, 2, 3], + "customer": ["Alice", "Bob", "Charlie"], + } +) + +right = ctx.from_pylist( + [ + {"id": 1, "name": "CityCabs"}, + {"id": 2, "name": "MetroRide"}, + {"id": 5, "name": "UrbanGo"}, + ] +) +``` + + +## Inner Join + +When using an inner join, only rows containing the common values between the two join columns present in both DataFrames +will be included in the resulting DataFrame. + +```python exec="1" source="material-block" result="text" session="joins" +print(left.join(right, left_on="customer_id", right_on="id", how="inner")) +``` + + +The parameters `left_on` and `right_on` specify the columns from the left DataFrame and right DataFrame that contain the values +that should match. + +## Left Join + +A left join combines rows from two DataFrames using the key columns. It returns all rows from the left DataFrame and +matching rows from the right DataFrame. If there's no match in the right DataFrame, it returns null +values for the corresponding columns.
+ +```python exec="1" source="material-block" result="text" session="joins" +print(left.join(right, left_on="customer_id", right_on="id", how="left")) +``` + + +## Full Join + +A full join merges rows from two tables based on a related column, returning all rows from both tables, even if there +is no match. Unmatched rows will have null values. + +```python exec="1" source="material-block" result="text" session="joins" +print(left.join(right, left_on="customer_id", right_on="id", how="full")) +``` + + +## Left Semi Join + +A left semi join retrieves matching rows from the left table while +omitting duplicates with multiple matches in the right table. + +```python exec="1" source="material-block" result="text" session="joins" +print(left.join(right, left_on="customer_id", right_on="id", how="semi")) +``` + + +## Left Anti Join + +A left anti join shows all rows from the left table without any matching rows in the right table, +based on the specified matching columns. It excludes rows from the left table that have at least one matching row in +the right table. + +```python exec="1" source="material-block" result="text" session="joins" +print(left.join(right, left_on="customer_id", right_on="id", how="anti")) +``` + + +## Duplicate Keys + +It is common to join two DataFrames on a common column name. Starting in +version 51.0.0, `datafusion-python` will coalesce columns with identical names by +default. This reduces problems with ambiguous column selection after joins. +You can disable this feature by setting the parameter `coalesce_duplicate_keys` +to `False`.
+ +```python exec="1" source="material-block" result="text" session="joins" +left = ctx.from_pydict( + { + "id": [1, 2, 3], + "customer": ["Alice", "Bob", "Charlie"], + } +) + +right = ctx.from_pylist( + [ + {"id": 1, "name": "CityCabs"}, + {"id": 2, "name": "MetroRide"}, + {"id": 5, "name": "UrbanGo"}, + ] +) + +print(left.join(right, "id", how="inner")) +``` + + +In contrast to the above example, if we wish to get both columns: + +```python exec="1" source="material-block" result="text" session="joins" +print(left.join(right, "id", how="inner", coalesce_duplicate_keys=False)) +``` + + +## Disambiguating Columns with `DataFrame.col()` + +When both DataFrames contain non-key columns with the same name, you can use +[`col`][datafusion.dataframe.DataFrame.col] on each DataFrame **before** the +join to create fully qualified column references. These references can then be +used in the join predicate and when selecting from the result. + +This is especially useful with [`join_on`][datafusion.dataframe.DataFrame.join_on], +which accepts expression-based predicates. + +```python exec="1" source="material-block" result="text" session="joins" +left = ctx.from_pydict( + { + "id": [1, 2, 3], + "val": [10, 20, 30], + } +) + +right = ctx.from_pydict( + { + "id": [1, 2, 3], + "val": [40, 50, 60], + } +) + +joined = left.join_on(right, left.col("id") == right.col("id"), how="inner") + +print(joined.select(left.col("id"), left.col("val"), right.col("val"))) +``` diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst deleted file mode 100644 index a289c9377..000000000 --- a/docs/source/user-guide/common-operations/joins.rst +++ /dev/null @@ -1,169 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. 
to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Joins -===== - -DataFusion supports the following join variants via the method :py:func:`~datafusion.dataframe.DataFrame.join` - -- Inner Join -- Left Join -- Right Join -- Full Join -- Left Semi Join -- Left Anti Join - -For the examples in this section we'll use the following two DataFrames - -.. ipython:: python - - from datafusion import SessionContext - - ctx = SessionContext() - - left = ctx.from_pydict( - { - "customer_id": [1, 2, 3], - "customer": ["Alice", "Bob", "Charlie"], - } - ) - - right = ctx.from_pylist([ - {"id": 1, "name": "CityCabs"}, - {"id": 2, "name": "MetroRide"}, - {"id": 5, "name": "UrbanGo"}, - ]) - -Inner Join ----------- - -When using an inner join, only rows containing the common values between the two join columns present in both DataFrames -will be included in the resulting DataFrame. - -.. ipython:: python - - left.join(right, left_on="customer_id", right_on="id", how="inner") - -The parameter ``join_keys`` specifies the columns from the left DataFrame and right DataFrame that contains the values -that should match. - -Left Join ---------- - -A left join combines rows from two DataFrames using the key columns. It returns all rows from the left DataFrame and -matching rows from the right DataFrame. If there's no match in the right DataFrame, it returns null -values for the corresponding columns. - -.. 
ipython:: python - - left.join(right, left_on="customer_id", right_on="id", how="left") - -Full Join ---------- - -A full join merges rows from two tables based on a related column, returning all rows from both tables, even if there -is no match. Unmatched rows will have null values. - -.. ipython:: python - - left.join(right, left_on="customer_id", right_on="id", how="full") - -Left Semi Join --------------- - -A left semi join retrieves matching rows from the left table while -omitting duplicates with multiple matches in the right table. - -.. ipython:: python - - left.join(right, left_on="customer_id", right_on="id", how="semi") - -Left Anti Join --------------- - -A left anti join shows all rows from the left table without any matching rows in the right table, -based on a the specified matching columns. It excludes rows from the left table that have at least one matching row in -the right table. - -.. ipython:: python - - left.join(right, left_on="customer_id", right_on="id", how="anti") - -Duplicate Keys --------------- - -It is common to join two DataFrames on a common column name. Starting in -version 51.0.0, ``datafusion-python``` will now coalesce on column with identical names by -default. This reduces problems with ambiguous column selection after joins. -You can disable this feature by setting the parameter ``coalesce_duplicate_keys`` -to ``False``. - -.. ipython:: python - - left = ctx.from_pydict( - { - "id": [1, 2, 3], - "customer": ["Alice", "Bob", "Charlie"], - } - ) - - right = ctx.from_pylist([ - {"id": 1, "name": "CityCabs"}, - {"id": 2, "name": "MetroRide"}, - {"id": 5, "name": "UrbanGo"}, - ]) - - left.join(right, "id", how="inner") - -In contrast to the above example, if we wish to get both columns: - -.. 
ipython:: python - - left.join(right, "id", how="inner", coalesce_duplicate_keys=False) - -Disambiguating Columns with ``DataFrame.col()`` ------------------------------------------------- - -When both DataFrames contain non-key columns with the same name, you can use -:py:meth:`~datafusion.dataframe.DataFrame.col` on each DataFrame **before** the -join to create fully qualified column references. These references can then be -used in the join predicate and when selecting from the result. - -This is especially useful with :py:meth:`~datafusion.dataframe.DataFrame.join_on`, -which accepts expression-based predicates. - -.. ipython:: python - - left = ctx.from_pydict( - { - "id": [1, 2, 3], - "val": [10, 20, 30], - } - ) - - right = ctx.from_pydict( - { - "id": [1, 2, 3], - "val": [40, 50, 60], - } - ) - - joined = left.join_on( - right, left.col("id") == right.col("id"), how="inner" - ) - - joined.select(left.col("id"), left.col("val"), right.col("val")) diff --git a/docs/source/user-guide/common-operations/select-and-filter.md b/docs/source/user-guide/common-operations/select-and-filter.md new file mode 100644 index 000000000..32d762b90 --- /dev/null +++ b/docs/source/user-guide/common-operations/select-and-filter.md @@ -0,0 +1,65 @@ + + +# Column Selections + +Use [`select`][datafusion.dataframe.DataFrame.select] for basic column selection. + +DataFusion can work with several file types, to start simple we can use a subset of the +[TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page), +which you can download [here](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet). 
+ +```python exec="1" source="material-block" result="text" session="select-and-filter" +ctx = SessionContext() +df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") +df.select("trip_distance", "passenger_count").show() +``` + + +For mathematical or logical operations use [`col`][datafusion.col.col] to select columns, and give meaningful names to the resulting +operations using [`alias`][datafusion.expr.Expr.alias] + +```python exec="1" source="material-block" result="text" session="select-and-filter" +df.select((col("tip_amount") + col("tolls_amount")).alias("tips_plus_tolls")).show() +``` + + +
+

Warning

+ +Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters +(ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple +column selection use [`select`][datafusion.dataframe.DataFrame.select] without double quotes + +
+ +For selecting columns with capital letters use `'"VendorID"'` + +```python exec="1" source="material-block" result="text" session="select-and-filter" +df.select(col('"VendorID"')).show() +``` + + +To combine it with literal values use the [`lit`][datafusion.lit] + +```python exec="1" source="material-block" result="text" session="select-and-filter" +large_trip_distance = col("trip_distance") > lit(5.0) +low_passenger_count = col("passenger_count") < lit(4) +df.select((large_trip_distance & low_passenger_count).alias("lonely_trips")).show() +``` diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst deleted file mode 100644 index 083bcbbd2..000000000 --- a/docs/source/user-guide/common-operations/select-and-filter.rst +++ /dev/null @@ -1,64 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Column Selections -================= - -Use :py:func:`~datafusion.dataframe.DataFrame.select` for basic column selection. - -DataFusion can work with several file types, to start simple we can use a subset of the -`TLC Trip Record Data `_, -which you can download `here `_. - -.. 
ipython:: python - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") - df.select("trip_distance", "passenger_count") - -For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting -operations using :py:func:`~datafusion.expr.Expr.alias` - - -.. ipython:: python - - from datafusion import col, lit - df.select((col("tip_amount") + col("tolls_amount")).alias("tips_plus_tolls")) - -.. warning:: - - Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters - (ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple - column selection use :py:func:`~datafusion.dataframe.DataFrame.select` without double quotes - -For selecting columns with capital letters use ``'"VendorID"'`` - -.. ipython:: python - - df.select(col('"VendorID"')) - - -To combine it with literal values use the :py:func:`~datafusion.lit` - -.. ipython:: python - - large_trip_distance = col("trip_distance") > lit(5.0) - low_passenger_count = col("passenger_count") < lit(4) - df.select((large_trip_distance & low_passenger_count).alias("lonely_trips")) - diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.md b/docs/source/user-guide/common-operations/udf-and-udfa.md new file mode 100644 index 000000000..11fac828b --- /dev/null +++ b/docs/source/user-guide/common-operations/udf-and-udfa.md @@ -0,0 +1,460 @@ + + +# User-Defined Functions + +DataFusion provides powerful expressions and functions, reducing the need for custom Python +functions. However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs). + +## Scalar Functions + +When writing a user-defined function that can operate on a row by row basis, these are called Scalar +Functions. 
You can define your own scalar function by calling +[`udf`][datafusion.user_defined.ScalarUDF.udf] . + +The basic definition of a scalar UDF is a python function that takes one or more +[pyarrow](https://arrow.apache.org/docs/python/index.html) arrays and returns a single array as +output. DataFusion scalar UDFs operate on an entire batch of records at a time, though the +evaluation of those records should be on a row by row basis. In the following example, we compute +if the input array contains null values. + +```python exec="1" source="material-block" result="text" session="udf-and-udfa" +import pyarrow +from datafusion import udf + + +def is_null(array: pyarrow.Array) -> pyarrow.Array: + return array.is_null() + + +is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), "stable") + +ctx = datafusion.SessionContext() + +batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, None, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], +) +df = ctx.create_dataframe([[batch]], name="batch_array") + +df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() +``` + + +In the previous example, we used the fact that pyarrow provides a variety of built in array +functions such as `is_null()`. There are additional pyarrow +[compute functions](https://arrow.apache.org/docs/python/compute.html) available. When possible, +it is highly recommended to use these functions because they can perform computations without doing +any copy operations from the original arrays. This leads to greatly improved performance. + +If you need to perform an operation in python that is not available with the pyarrow compute +functions, you will need to convert the record batch into python values, perform your operation, +and construct an array. This operation of converting the built in data type of the array into a +python object can be one of the slowest operations in DataFusion, so it should be done sparingly. 
+ +The following example performs the same operation as before with `is_null` but demonstrates +converting to Python objects to do the evaluation. + +```python exec="1" source="material-block" result="text" session="udf-and-udfa" +import datafusion +import pyarrow +from datafusion import col, udf + + +def is_null(array: pyarrow.Array) -> pyarrow.Array: + return pyarrow.array([value.as_py() is None for value in array]) + + +is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), "stable") + +ctx = datafusion.SessionContext() + +batch = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1, None, 3]), pyarrow.array([4, 5, 6])], + names=["a", "b"], +) +df = ctx.create_dataframe([[batch]], name="batch_array") + +df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() +``` + + +In this example we passed the PyArrow `DataType` when we defined the function +by calling `udf()`. If you need additional control, such as specifying +metadata or nullability of the input or output, you can instead specify a +PyArrow `Field`. + +If you need to write a custom function but do not want to incur the performance +cost of converting to Python objects and back, a more advanced approach is to +write Rust based UDFs and to expose them to Python. There is an example in the +[DataFusion blog](https://datafusion.apache.org/blog/2024/11/19/datafusion-python-udf-comparisons/) +describing how to do this. + +### When not to use a UDF + +A UDF is the right tool when the per-row computation genuinely cannot be +expressed with DataFusion's built-in expressions. It is often the *wrong* +tool for a predicate that *can* be written as an `Expr` tree but feels +easier to write as a Python function — for example, a filter that keeps +a row if it matches any one of several rule sets, where each rule set +checks its own combination of columns (the worked example at the end of +this section keeps a row when it matches any one of several brand-specific +rules). 
Looping over the rules in Python and returning a boolean per row +reads naturally and is tempting to wrap in a UDF, but a UDF is opaque to +the optimizer: filters expressed as UDFs lose several rewrites that the +engine applies to filters built from native expressions. The most visible +of these is **predicate pushdown into the table provider**: a native +predicate can be handed to the source so it skips data before it is read, +while a UDF predicate cannot. The example below uses Parquet, where +pushdown prunes whole row groups using the min/max statistics in the +footer, but the same mechanism applies to any table provider that +advertises filter support — including custom providers. + +The following example writes a small Parquet file, then filters it two +ways: first with a native expression, then with a UDF that computes the +same result. The filter itself is simple on purpose so we can compare +the plans side by side. + +```python exec="1" source="material-block" session="udf-and-udfa" +import os +import tempfile + +import pyarrow as pa +import pyarrow.parquet as pq +from datafusion import col, udf + +tmpdir = tempfile.mkdtemp() +parquet_path = os.path.join(tmpdir, "items.parquet") +pq.write_table( + pa.table( + { + "id": list(range(100)), + "brand": ["A", "B", "C", "D"] * 25, + "qty": [i * 10 for i in range(100)], + } + ), + parquet_path, +) + +ctx = SessionContext() +items = ctx.read_parquet(parquet_path) +``` + + +**Native-expression predicate.** The filter is a plain boolean tree +over column references and literals, so the optimizer can analyze it: + +```python exec="1" source="material-block" result="text" session="udf-and-udfa" +native_filtered = items.filter((col("brand") == lit("A")) & (col("qty") >= lit(150))) +print(native_filtered.execution_plan().display_indent()) +``` + + +Notice the `DataSourceExec` line. 
It carries three annotations the +optimizer computed from the predicate: + +- `predicate=brand@1 = A AND qty@2 >= 150` — the filter is pushed + into the Parquet scan itself, so the scan only reads matching rows. +- `pruning_predicate=... brand_min@0 <= A AND A <= brand_max@1 ... + qty_max@4 >= 150` — the scan prunes whole row groups by consulting + the Parquet min/max statistics in the footer *before* reading any + column data. +- `required_guarantees=[brand in (A)]` — the scan uses this when a + bloom filter or dictionary is available to skip pages. + +**UDF predicate.** Now wrap the same logic in a Python UDF: + +```python exec="1" source="material-block" result="text" session="udf-and-udfa" +def brand_qty_filter(brand_arr: pa.Array, qty_arr: pa.Array) -> pa.Array: + return pa.array( + [b.as_py() == "A" and q.as_py() >= 150 for b, q in zip(brand_arr, qty_arr)] + ) + + +pred_udf = udf( + brand_qty_filter, + [pa.string(), pa.int64()], + pa.bool_(), + "stable", +) +udf_filtered = items.filter(pred_udf(col("brand"), col("qty"))) +print(udf_filtered.execution_plan().display_indent()) +``` + + +The `DataSourceExec` now carries only `predicate=brand_qty_filter(...)`. +There is no `pruning_predicate` and no `required_guarantees`: the +scan has to materialize every row group and hand each row to the +Python callback just to decide whether to keep it. + +At small scale the cost difference is invisible; on a Parquet file with +many row groups, or data whose min/max statistics line up well with +the predicate, the native form can skip most of the file. The UDF form +reads all of it. + +**Takeaway.** Reach for a UDF when the per-row computation is genuinely +not expressible as a tree of built-in functions (custom numerical work, +external lookups, complex business rules). When it *is* expressible — +even if the native form is a little more verbose — build the `Expr` +tree directly so the optimizer can see through it. 
For disjunctive +predicates the idiom is to produce one clause per bucket and combine +them with `|`: + +```python +from functools import reduce +from operator import or_ +from datafusion import col, lit, functions as f + +buckets = { + "Brand#12": {"containers": ["SM CASE", "SM BOX"], "min_qty": 1, "max_size": 5}, + "Brand#23": {"containers": ["MED BAG", "MED BOX"], "min_qty": 10, "max_size": 10}, +} + +def bucket_clause(brand, spec): + return ( + (col("brand") == lit(brand)) + & f.in_list(col("container"), [lit(c) for c in spec["containers"]]) + & (col("quantity") >= lit(spec["min_qty"])) + & (col("quantity") <= lit(spec["min_qty"] + 10)) + & (col("size") >= lit(1)) + & (col("size") <= lit(spec["max_size"])) + ) + +predicate = reduce(or_, (bucket_clause(b, s) for b, s in buckets.items())) +df = df.filter(predicate) +``` + +## Aggregate Functions + +The [`udaf`][datafusion.user_defined.AggregateUDF.udaf] function allows you to define User-Defined +Aggregate Functions (UDAFs). To use this you must implement an +[`Accumulator`][datafusion.user_defined.Accumulator] that determines how the aggregation is performed. + +When defining a UDAF there are four methods you need to implement. The `update` function takes the +array(s) of input and updates the internal state of the accumulator. You should define this function +to have as many input arguments as you will pass when calling the UDAF. Since aggregation may be +split into multiple batches, we must have a method to combine multiple batches. For this, we have +two functions, `state` and `merge`. `state` will return an array of scalar values that contain +the current state of a single batch accumulation. Then we must `merge` the results of these +different states. Finally `evaluate` is the call that will return the final result after the +`merge` is complete. + +In the following example we want to define a custom aggregate function that will return the +difference between the sum of two columns. 
The state can be represented by a single value and we can +also see how the inputs to `update` and `merge` differ. + +```python +import pyarrow as pa +import pyarrow.compute +import datafusion +from datafusion import col, udaf, Accumulator +from typing import List + +class MyAccumulator(Accumulator): + """ + Interface of a user-defined accumulation. + """ + def __init__(self): + self._sum = 0.0 + + def update(self, values_a: pa.Array, values_b: pa.Array) -> None: + self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py() + + def merge(self, states: list[pa.Array]) -> None: + self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py() + + def state(self) -> list[pa.Scalar]: + return [pyarrow.scalar(self._sum)] + + def evaluate(self) -> pa.Scalar: + return pyarrow.scalar(self._sum) + +ctx = datafusion.SessionContext() +df = ctx.from_pydict( + { + "a": [4, 5, 6], + "b": [1, 2, 3], + } +) + +my_udaf = udaf(MyAccumulator, [pa.float64(), pa.float64()], pa.float64(), [pa.float64()], 'stable') + +df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")]) +``` + +### FAQ + +**How do I return a list from a UDAF?** + +Both the `evaluate` and the `state` functions expect to return scalar values. +If you wish to return a list array as a scalar value, the best practice is to +wrap the values in a `pyarrow.Scalar` object. For example, you can return a +timestamp list with `pa.scalar([...], type=pa.list_(pa.timestamp("ms")))` and +register the appropriate return or state types as +`return_type=pa.list_(pa.timestamp("ms"))` and +`state_type=[pa.list_(pa.timestamp("ms"))]`, respectively. + +As of DataFusion 52.0.0, you can return any Python object, including a +PyArrow array, as the return value(s) for these functions and DataFusion will +attempt to create a scalar type from the value.
DataFusion has been tested to +convert PyArrow, nanoarrow, and arro3 objects as well as primitive data types +like integers, strings, and so on. + +## Window Functions + +To implement a User-Defined Window Function (UDWF) you must call the +[`udwf`][datafusion.user_defined.WindowUDF.udwf] function using a class that implements the abstract +class [`WindowEvaluator`][datafusion.user_defined.WindowEvaluator]. + +There are three methods of evaluation of UDWFs. + +- `evaluate` is the simplest case, where you are given an array and are expected to calculate the + value for a single row of that array. It is also the least performant. +- `evaluate_all` computes the values for all rows for an input array at a single time. +- `evaluate_all_with_rank` computes the values for all rows, but you only have the rank + information for the rows. + +Which methods you implement are based upon which of these options are set. + +| `uses_window_frame` | `supports_bounded_execution` | `include_rank` | function_to_implement | +|---|---|---|---| +| False (default) | False (default) | False (default) | `evaluate_all` | +| False | True | False | `evaluate` | +| False | True/False | True | `evaluate_all_with_rank` | +| True | True/False | True/False | `evaluate` | + +### UDWF options + +When you define your UDWF you can override the functions that return these values. They will +determine which evaluate functions are called. + +- `uses_window_frame` is set for functions that compute based on the specified window frame. If + your function depends upon the specified frame, set this to `True`. +- `supports_bounded_execution` specifies if your function can be incrementally computed. +- `include_rank` is set to `True` for window functions that can be computed only using the rank + information.
+ +```python +import pyarrow as pa +from datafusion import udwf, col, SessionContext +from datafusion.user_defined import WindowEvaluator + +class ExponentialSmooth(WindowEvaluator): + def __init__(self, alpha: float) -> None: + self.alpha = alpha + + def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: + results = [] + curr_value = 0.0 + values = values[0] + for idx in range(num_rows): + if idx == 0: + curr_value = values[idx].as_py() + else: + curr_value = values[idx].as_py() * self.alpha + curr_value * ( + 1.0 - self.alpha + ) + results.append(curr_value) + + return pa.array(results) + +exp_smooth = udwf( + ExponentialSmooth(0.9), + pa.float64(), + pa.float64(), + volatility="immutable", +) + +ctx = SessionContext() + +df = ctx.from_pydict({ + "a": [1.0, 2.1, 2.9, 4.0, 5.1, 6.0, 6.9, 8.0] +}) + +df.select("a", exp_smooth(col("a")).alias("smooth_a")).show() +``` + +## Table Functions + +User Defined Table Functions are slightly different than the other functions +described here. These functions take any number of `Expr` arguments, but only +literal expressions are supported. Table functions must return a Table +Provider as described on the Custom Table Provider page. + +Once you have a table function, you can register it with the session context +by using [`register_udtf`][datafusion.context.SessionContext.register_udtf]. + +There are examples of both Rust-backed and Python-based table functions in the +examples folder of the repository. If you have a Rust-backed table function +that you wish to expose via PyO3, you need to expose it as a `PyCapsule`.
+ +```rust +#[pymethods] +impl MyTableFunction { + fn __datafusion_table_function__<'py>( + &self, + py: Python<'py>, + ) -> PyResult<Bound<'py, PyCapsule>> { + let name = cr"datafusion_table_function".into(); + + let func = self.clone(); + let provider = FFI_TableFunction::new(Arc::new(func), None); + + PyCapsule::new(py, provider, Some(name)) + } +} +``` + +### Accessing the Calling Session + +Pure-Python UDTFs can opt into receiving the calling +[`SessionContext`][datafusion.context.SessionContext] by registering with +`with_session=True`. The context is passed as a `session` keyword +argument on every invocation. Use it to look up registered tables, +UDFs, or session configuration from inside the callback. + +```python +from datafusion import SessionContext, Table, udtf +from datafusion.context import TableProviderExportable +import pyarrow as pa +import pyarrow.dataset as ds + +@udtf("list_tables", with_session=True) +def list_tables(*, session: SessionContext) -> TableProviderExportable: + names = sorted(session.catalog().schema().names()) + batch = pa.RecordBatch.from_pydict({"name": names}) + return Table(ds.dataset([batch])) + +ctx = SessionContext() +ctx.register_batch("t1", pa.RecordBatch.from_pydict({"x": [1]})) +ctx.register_udtf(list_tables) +ctx.sql("SELECT * FROM list_tables()").show() +``` + +Without `with_session=True`, the callback receives only the positional +expression arguments. The flag is opt-in so existing UDTFs keep working +unchanged. + +The injected `session` is a fresh [`SessionContext`][datafusion.context.SessionContext] +wrapper backed by the same underlying state as the caller, so registries +(tables, UDFs, catalogs) are visible. Registry mutations (e.g. registering +a new table or UDF) propagate to the live session because the registries +are reference-counted and shared. Configuration changes made through the +wrapper (e.g. setting session options) do **not** propagate — the wrapper +holds its own clone of the session config.
diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.rst deleted file mode 100644 index 918c2e29e..000000000 --- a/docs/source/user-guide/common-operations/udf-and-udfa.rst +++ /dev/null @@ -1,472 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -User-Defined Functions -====================== - -DataFusion provides powerful expressions and functions, reducing the need for custom Python -functions. However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs). - -Scalar Functions ----------------- - -When writing a user-defined function that can operate on a row by row basis, these are called Scalar -Functions. You can define your own scalar function by calling -:py:func:`~datafusion.user_defined.ScalarUDF.udf` . - -The basic definition of a scalar UDF is a python function that takes one or more -`pyarrow `_ arrays and returns a single array as -output. DataFusion scalar UDFs operate on an entire batch of records at a time, though the -evaluation of those records should be on a row by row basis. In the following example, we compute -if the input array contains null values. - -.. 
ipython:: python - - import pyarrow - import datafusion - from datafusion import udf, col - - def is_null(array: pyarrow.Array) -> pyarrow.Array: - return array.is_null() - - is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable') - - ctx = datafusion.SessionContext() - - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, None, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]], name="batch_array") - - df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() - -In the previous example, we used the fact that pyarrow provides a variety of built in array -functions such as ``is_null()``. There are additional pyarrow -`compute functions `_ available. When possible, -it is highly recommended to use these functions because they can perform computations without doing -any copy operations from the original arrays. This leads to greatly improved performance. - -If you need to perform an operation in python that is not available with the pyarrow compute -functions, you will need to convert the record batch into python values, perform your operation, -and construct an array. This operation of converting the built in data type of the array into a -python object can be one of the slowest operations in DataFusion, so it should be done sparingly. - -The following example performs the same operation as before with ``is_null`` but demonstrates -converting to Python objects to do the evaluation. - -.. 
ipython:: python - - import pyarrow - import datafusion - from datafusion import udf, col - - def is_null(array: pyarrow.Array) -> pyarrow.Array: - return pyarrow.array([value.as_py() is None for value in array]) - - is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable') - - ctx = datafusion.SessionContext() - - batch = pyarrow.RecordBatch.from_arrays( - [pyarrow.array([1, None, 3]), pyarrow.array([4, 5, 6])], - names=["a", "b"], - ) - df = ctx.create_dataframe([[batch]], name="batch_array") - - df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show() - -In this example we passed the PyArrow ``DataType`` when we defined the function -by calling ``udf()``. If you need additional control, such as specifying -metadata or nullability of the input or output, you can instead specify a -PyArrow ``Field``. - -If you need to write a custom function but do not want to incur the performance -cost of converting to Python objects and back, a more advanced approach is to -write Rust based UDFs and to expose them to Python. There is an example in the -`DataFusion blog `_ -describing how to do this. - -When not to use a UDF -^^^^^^^^^^^^^^^^^^^^^ - -A UDF is the right tool when the per-row computation genuinely cannot be -expressed with DataFusion's built-in expressions. It is often the *wrong* -tool for a predicate that *can* be written as an ``Expr`` tree but feels -easier to write as a Python function — for example, a filter that keeps -a row if it matches any one of several rule sets, where each rule set -checks its own combination of columns (the worked example at the end of -this section keeps a row when it matches any one of several brand-specific -rules). Looping over the rules in Python and returning a boolean per row -reads naturally and is tempting to wrap in a UDF, but a UDF is opaque to -the optimizer: filters expressed as UDFs lose several rewrites that the -engine applies to filters built from native expressions. 
The most visible -of these is **predicate pushdown into the table provider**: a native -predicate can be handed to the source so it skips data before it is read, -while a UDF predicate cannot. The example below uses Parquet, where -pushdown prunes whole row groups using the min/max statistics in the -footer, but the same mechanism applies to any table provider that -advertises filter support — including custom providers. - -The following example writes a small Parquet file, then filters it two -ways: first with a native expression, then with a UDF that computes the -same result. The filter itself is simple on purpose so we can compare -the plans side by side. - -.. ipython:: python - - import tempfile, os - import pyarrow as pa - import pyarrow.parquet as pq - from datafusion import SessionContext, col, lit, udf - - tmpdir = tempfile.mkdtemp() - parquet_path = os.path.join(tmpdir, "items.parquet") - pq.write_table( - pa.table({ - "id": list(range(100)), - "brand": ["A", "B", "C", "D"] * 25, - "qty": [i * 10 for i in range(100)], - }), - parquet_path, - ) - - ctx = SessionContext() - items = ctx.read_parquet(parquet_path) - -**Native-expression predicate.** The filter is a plain boolean tree -over column references and literals, so the optimizer can analyze it: - -.. ipython:: python - - native_filtered = items.filter( - (col("brand") == lit("A")) & (col("qty") >= lit(150)) - ) - print(native_filtered.execution_plan().display_indent()) - -Notice the ``DataSourceExec`` line. It carries three annotations the -optimizer computed from the predicate: - -- ``predicate=brand@1 = A AND qty@2 >= 150`` — the filter is pushed - into the Parquet scan itself, so the scan only reads matching rows. -- ``pruning_predicate=... brand_min@0 <= A AND A <= brand_max@1 ... - qty_max@4 >= 150`` — the scan prunes whole row groups by consulting - the Parquet min/max statistics in the footer *before* reading any - column data. 
-- ``required_guarantees=[brand in (A)]`` — the scan uses this when a - bloom filter or dictionary is available to skip pages. - -**UDF predicate.** Now wrap the same logic in a Python UDF: - -.. ipython:: python - - def brand_qty_filter(brand_arr: pa.Array, qty_arr: pa.Array) -> pa.Array: - return pa.array([ - b.as_py() == "A" and q.as_py() >= 150 - for b, q in zip(brand_arr, qty_arr) - ]) - - pred_udf = udf( - brand_qty_filter, [pa.string(), pa.int64()], pa.bool_(), "stable", - ) - udf_filtered = items.filter(pred_udf(col("brand"), col("qty"))) - print(udf_filtered.execution_plan().display_indent()) - -The ``DataSourceExec`` now carries only ``predicate=brand_qty_filter(...)``. -There is no ``pruning_predicate`` and no ``required_guarantees``: the -scan has to materialize every row group and hand each row to the -Python callback just to decide whether to keep it. - -At small scale the cost difference is invisible; on a Parquet file with -many row groups, or data whose min/max statistics line up well with -the predicate, the native form can skip most of the file. The UDF form -reads all of it. - -**Takeaway.** Reach for a UDF when the per-row computation is genuinely -not expressible as a tree of built-in functions (custom numerical work, -external lookups, complex business rules). When it *is* expressible — -even if the native form is a little more verbose — build the ``Expr`` -tree directly so the optimizer can see through it. For disjunctive -predicates the idiom is to produce one clause per bucket and combine -them with ``|``: - -.. 
code-block:: python - - from functools import reduce - from operator import or_ - from datafusion import col, lit, functions as f - - buckets = { - "Brand#12": {"containers": ["SM CASE", "SM BOX"], "min_qty": 1, "max_size": 5}, - "Brand#23": {"containers": ["MED BAG", "MED BOX"], "min_qty": 10, "max_size": 10}, - } - - def bucket_clause(brand, spec): - return ( - (col("brand") == lit(brand)) - & f.in_list(col("container"), [lit(c) for c in spec["containers"]]) - & (col("quantity") >= lit(spec["min_qty"])) - & (col("quantity") <= lit(spec["min_qty"] + 10)) - & (col("size") >= lit(1)) - & (col("size") <= lit(spec["max_size"])) - ) - - predicate = reduce(or_, (bucket_clause(b, s) for b, s in buckets.items())) - df = df.filter(predicate) - -Aggregate Functions -------------------- - -The :py:func:`~datafusion.user_defined.AggregateUDF.udaf` function allows you to define User-Defined -Aggregate Functions (UDAFs). To use this you must implement an -:py:class:`~datafusion.user_defined.Accumulator` that determines how the aggregation is performed. - -When defining a UDAF there are four methods you need to implement. The ``update`` function takes the -array(s) of input and updates the internal state of the accumulator. You should define this function -to have as many input arguments as you will pass when calling the UDAF. Since aggregation may be -split into multiple batches, we must have a method to combine multiple batches. For this, we have -two functions, ``state`` and ``merge``. ``state`` will return an array of scalar values that contain -the current state of a single batch accumulation. Then we must ``merge`` the results of these -different states. Finally ``evaluate`` is the call that will return the final result after the -``merge`` is complete. - -In the following example we want to define a custom aggregate function that will return the -difference between the sum of two columns. 
The state can be represented by a single value and we can -also see how the inputs to ``update`` and ``merge`` differ. - -.. code-block:: python - - import pyarrow as pa - import pyarrow.compute - import datafusion - from datafusion import col, udaf, Accumulator - from typing import List - - class MyAccumulator(Accumulator): - """ - Interface of a user-defined accumulation. - """ - def __init__(self): - self._sum = 0.0 - - def update(self, values_a: pa.Array, values_b: pa.Array) -> None: - self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py() - - def merge(self, states: list[pa.Array]) -> None: - self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py() - - def state(self) -> list[pa.Scalar]: - return [pyarrow.scalar(self._sum)] - - def evaluate(self) -> pa.Scalar: - return pyarrow.scalar(self._sum) - - ctx = datafusion.SessionContext() - df = ctx.from_pydict( - { - "a": [4, 5, 6], - "b": [1, 2, 3], - } - ) - - my_udaf = udaf(MyAccumulator, [pa.float64(), pa.float64()], pa.float64(), [pa.float64()], 'stable') - - df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")]) - -FAQ -^^^ - -**How do I return a list from a UDAF?** - -Both the ``evaluate`` and the ``state`` functions expect to return scalar values. -If you wish to return a list array as a scalar value, the best practice is to -wrap the values in a ``pyarrow.Scalar`` object. For example, you can return a -timestamp list with ``pa.scalar([...], type=pa.list_(pa.timestamp("ms")))`` and -register the appropriate return or state types as -``return_type=pa.list_(pa.timestamp("ms"))`` and -``state_type=[pa.list_(pa.timestamp("ms"))]``, respectively. - -As of DataFusion 52.0.0 , you can pass return any Python object, including a -PyArrow array, as the return value(s) for these functions and DataFusion will -attempt to create a scalar type from the value. 
DataFusion has been tested to -convert PyArrow, nanoarrow, and arro3 objects as well as primitive data types -like integers, strings, and so on. - -Window Functions ----------------- - -To implement a User-Defined Window Function (UDWF) you must call the -:py:func:`~datafusion.user_defined.WindowUDF.udwf` function using a class that implements the abstract -class :py:class:`~datafusion.user_defined.WindowEvaluator`. - -There are three methods of evaluation of UDWFs. - -- ``evaluate`` is the simplest case, where you are given an array and are expected to calculate the - value for a single row of that array. This is the simplest case, but also the least performant. -- ``evaluate_all`` computes the values for all rows for an input array at a single time. -- ``evaluate_all_with_rank`` computes the values for all rows, but you only have the rank - information for the rows. - -Which methods you implement are based upon which of these options are set. - -.. list-table:: - :header-rows: 1 - - * - ``uses_window_frame`` - - ``supports_bounded_execution`` - - ``include_rank`` - - function_to_implement - * - False (default) - - False (default) - - False (default) - - ``evaluate_all`` - * - False - - True - - False - - ``evaluate`` - * - False - - True - - False - - ``evaluate_all_with_rank`` - * - True - - True/False - - True/False - - ``evaluate`` - -UDWF options -^^^^^^^^^^^^ - -When you define your UDWF you can override the functions that return these values. They will -determine which evaluate functions are called. - -- ``uses_window_frame`` is set for functions that compute based on the specified window frame. If - your function depends upon the specified frame, set this to ``True``. -- ``supports_bounded_execution`` specifies if your function can be incrementally computed. -- ``include_rank`` is set to ``True`` for window functions that can be computed only using the rank - information. - - -.. 
code-block:: python - - import pyarrow as pa - from datafusion import udwf, col, SessionContext - from datafusion.user_defined import WindowEvaluator - - class ExponentialSmooth(WindowEvaluator): - def __init__(self, alpha: float) -> None: - self.alpha = alpha - - def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: - results = [] - curr_value = 0.0 - values = values[0] - for idx in range(num_rows): - if idx == 0: - curr_value = values[idx].as_py() - else: - curr_value = values[idx].as_py() * self.alpha + curr_value * ( - 1.0 - self.alpha - ) - results.append(curr_value) - - return pa.array(results) - - exp_smooth = udwf( - ExponentialSmooth(0.9), - pa.float64(), - pa.float64(), - volatility="immutable", - ) - - ctx = SessionContext() - - df = ctx.from_pydict({ - "a": [1.0, 2.1, 2.9, 4.0, 5.1, 6.0, 6.9, 8.0] - }) - - df.select("a", exp_smooth(col("a")).alias("smooth_a")).show() - -Table Functions ---------------- - -User Defined Table Functions are slightly different than the other functions -described here. These functions take any number of `Expr` arguments, but only -literal expressions are supported. Table functions must return a Table -Provider as described in the ref:`_io_custom_table_provider` page. - -Once you have a table function, you can register it with the session context -by using :py:func:`datafusion.context.SessionContext.register_udtf`. - -There are examples of both rust backed and python based table functions in the -examples folder of the repository. If you have a rust backed table function -that you wish to expose via PyO3, you need to expose it as a ``PyCapsule``. - -.. 
code-block:: rust - - #[pymethods] - impl MyTableFunction { - fn __datafusion_table_function__<'py>( - &self, - py: Python<'py>, - ) -> PyResult> { - let name = cr"datafusion_table_function".into(); - - let func = self.clone(); - let provider = FFI_TableFunction::new(Arc::new(func), None); - - PyCapsule::new(py, provider, Some(name)) - } - } - -Accessing the Calling Session -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pure-Python UDTFs can opt into receiving the calling -:py:class:`~datafusion.SessionContext` by registering with -``with_session=True``. The context is passed as a ``session`` keyword -argument on every invocation. Use it to look up registered tables, -UDFs, or session configuration from inside the callback. - -.. code-block:: python - - from datafusion import SessionContext, Table, udtf - from datafusion.context import TableProviderExportable - import pyarrow as pa - import pyarrow.dataset as ds - - @udtf("list_tables", with_session=True) - def list_tables(*, session: SessionContext) -> TableProviderExportable: - names = sorted(session.catalog().schema().names()) - batch = pa.RecordBatch.from_pydict({"name": names}) - return Table(ds.dataset([batch])) - - ctx = SessionContext() - ctx.register_batch("t1", pa.RecordBatch.from_pydict({"x": [1]})) - ctx.register_udtf(list_tables) - ctx.sql("SELECT * FROM list_tables()").show() - -Without ``with_session=True``, the callback receives only the positional -expression arguments. The flag is opt-in so existing UDTFs keep working -unchanged. - -The injected ``session`` is a fresh :py:class:`~datafusion.SessionContext` -wrapper backed by the same underlying state as the caller, so registries -(tables, UDFs, catalogs) are visible. Registry mutations (e.g. registering -a new table or UDF) propagate to the live session because the registries -are reference-counted and shared. Configuration changes made through the -wrapper (e.g. 
setting session options) do **not** propagate — the wrapper -holds its own clone of the session config. diff --git a/docs/source/user-guide/common-operations/views.md b/docs/source/user-guide/common-operations/views.md new file mode 100644 index 000000000..0522f8f1f --- /dev/null +++ b/docs/source/user-guide/common-operations/views.md @@ -0,0 +1,58 @@ + + +# Registering Views + +You can use the context's [`register_view`][datafusion.context.SessionContext.register_view] method to register a DataFrame as a view + +```python +from datafusion import SessionContext, col, literal + +# Create a DataFusion context +ctx = SessionContext() + +# Create sample data +data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} + +# Create a DataFrame from the dictionary +df = ctx.from_pydict(data, "my_table") + +# Filter the DataFrame (for example, keep rows where a > 2) +df_filtered = df.filter(col("a") > literal(2)) + +# Register the dataframe as a view with the context +ctx.register_view("view1", df_filtered) + +# Now run a SQL query against the registered view +df_view = ctx.sql("SELECT * FROM view1") + +# Collect the results +results = df_view.collect() + +# Convert results to a list of dictionaries for display +result_dicts = [batch.to_pydict() for batch in results] + +print(result_dicts) +``` + +This will output: + +```python +[{'a': [3, 4, 5], 'b': [30, 40, 50]}] +``` diff --git a/docs/source/user-guide/common-operations/views.rst b/docs/source/user-guide/common-operations/views.rst deleted file mode 100644 index df11e3abe..000000000 --- a/docs/source/user-guide/common-operations/views.rst +++ /dev/null @@ -1,58 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. 
"License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -====================== -Registering Views -====================== - -You can use the context's ``register_view`` method to register a DataFrame as a view - -.. code-block:: python - - from datafusion import SessionContext, col, literal - - # Create a DataFusion context - ctx = SessionContext() - - # Create sample data - data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]} - - # Create a DataFrame from the dictionary - df = ctx.from_pydict(data, "my_table") - - # Filter the DataFrame (for example, keep rows where a > 2) - df_filtered = df.filter(col("a") > literal(2)) - - # Register the dataframe as a view with the context - ctx.register_view("view1", df_filtered) - - # Now run a SQL query against the registered view - df_view = ctx.sql("SELECT * FROM view1") - - # Collect the results - results = df_view.collect() - - # Convert results to a list of dictionaries for display - result_dicts = [batch.to_pydict() for batch in results] - - print(result_dicts) - -This will output: - -.. code-block:: python - - [{'a': [3, 4, 5], 'b': [30, 40, 50]}] diff --git a/docs/source/user-guide/common-operations/windows.md b/docs/source/user-guide/common-operations/windows.md new file mode 100644 index 000000000..125c9f120 --- /dev/null +++ b/docs/source/user-guide/common-operations/windows.md @@ -0,0 +1,227 @@ + + + +# Window Functions + +In this section you will learn about window functions. 
A window function utilizes values from one or +multiple rows to produce a result for each individual row, unlike an aggregate function that +provides a single value for multiple rows. + +The window functions are available in the [`functions`][datafusion.functions] module. + +We'll use the pokemon dataset (from Ritchie Vink) in the following examples. + +```python exec="1" source="material-block" session="windows" +ctx = SessionContext() +df = ctx.read_csv("pokemon.csv") +``` + + +Here is an example that shows how you can compare each pokemon's speed to the speed of the +previous row in the DataFrame. + +```python exec="1" source="material-block" result="text" session="windows" +df.select(col('"Name"'), col('"Speed"'), f.lag(col('"Speed"')).alias("Previous Speed")).show() +``` + + +## Setting Parameters + +### Ordering + +You can control the order in which rows are processed by window functions by providing +a list of `order_by` functions for the `order_by` parameter. + +```python exec="1" source="material-block" result="text" session="windows" +df.select( + col('"Name"'), + col('"Attack"'), + col('"Type 1"'), + f.rank( + partition_by=[col('"Type 1"')], + order_by=[col('"Attack"').sort(ascending=True)], + ).alias("rank"), +).sort(col('"Type 1"'), col('"Attack"')).show() +``` + + +### Partitions + +A window function can take a list of `partition_by` columns similar to an +[Aggregation Function](aggregations.md). This will cause the window values to be evaluated +independently for each of the partitions. In the example above, we found the rank of each +Pokemon per `Type 1` partitions. 
We can see the first couple of each partition if we do +the following: + +```python exec="1" source="material-block" result="text" session="windows" +df.select( + col('"Name"'), + col('"Attack"'), + col('"Type 1"'), + f.rank( + partition_by=[col('"Type 1"')], + order_by=[col('"Attack"').sort(ascending=True)], + ).alias("rank"), +).filter(col("rank") < lit(3)).sort(col('"Type 1"'), col("rank")).show() +``` + + +### Window Frame + +When using aggregate functions, the Window Frame defines the rows over which it operates. +If you do not specify a Window Frame, the frame will be set depending on the following +criteria. + +- If an `order_by` clause is set, the default window frame is defined as the rows between + unbounded preceding and the current row. +- If an `order_by` is not set, the default frame is defined as the rows between unbounded + preceding and unbounded following (the entire partition). + +Window Frames are defined by three parameters: unit type, starting bound, and ending bound. + +The unit types available are: + +- Rows: The starting and ending boundaries are defined by the number of rows relative to the + current row. +- Range: When using Range, the `order_by` clause must have exactly one term. The boundaries + are defined by how close the rows are to the value of the expression in the `order_by` + parameter. +- Groups: A "group" is the set of all rows that have equivalent values for all terms in the + `order_by` clause. + +In this example we perform a "rolling average" of the speed of the current Pokemon and the +two preceding rows.
+ +```python exec="1" source="material-block" result="text" session="windows" +from datafusion.expr import Window, WindowFrame + +df.select( + col('"Name"'), + col('"Speed"'), + f.avg(col('"Speed"')) + .over(Window(window_frame=WindowFrame("rows", 2, 0), order_by=[col('"Speed"')])) + .alias("Previous Speed"), +).show() +``` + + +### Null Treatment + +When using aggregate functions as window functions, it is often useful to specify how null values +should be treated. In order to do this you need to use the builder function. In future releases +we expect this to be simplified in the interface. + +One common usage for handling nulls is the case where you want to find the last value up to the +current row. In the following example we demonstrate how setting the null treatment to ignore +nulls will fill in with the value of the most recent non-null row. To do this, we also will set +the window frame so that we only process up to the current row. + +In this example, we filter down to one specific type of Pokemon that does have some entries in +its `Type 2` column that are null. + +```python exec="1" source="material-block" result="text" session="windows" +from datafusion.common import NullTreatment + +df.filter(col('"Type 1"') == lit("Bug")).select( + '"Name"', + '"Type 2"', + f.last_value(col('"Type 2"')) + .over( + Window( + window_frame=WindowFrame("rows", None, 0), + order_by=[col('"Speed"')], + null_treatment=NullTreatment.IGNORE_NULLS, + ) + ) + .alias("last_wo_null"), + f.last_value(col('"Type 2"')) + .over( + Window( + window_frame=WindowFrame("rows", None, 0), + order_by=[col('"Speed"')], + null_treatment=NullTreatment.RESPECT_NULLS, + ) + ) + .alias("last_with_null"), +).show() +``` + + +## Aggregate Functions + +You can use any [Aggregation Function](aggregations.md) as a window function.
Here +is an example that shows how to compare each pokemon's attack power with the average attack +power in its `"Type 1"` using the [`avg`][datafusion.functions.avg] function. + +```python exec="1" source="material-block" result="text" session="windows" +df.select( + col('"Name"'), + col('"Attack"'), + col('"Type 1"'), + f.avg(col('"Attack"')) + .over( + Window( + window_frame=WindowFrame("rows", None, None), + partition_by=[col('"Type 1"')], + ) + ) + .alias("Average Attack"), +).show() +``` + + +## Available Functions + +The possible window functions are: + +1. Rank Functions + : - [`rank`][datafusion.functions.rank] + - [`dense_rank`][datafusion.functions.dense_rank] + - [`ntile`][datafusion.functions.ntile] + - [`row_number`][datafusion.functions.row_number] +2. Analytical Functions + : - [`cume_dist`][datafusion.functions.cume_dist] + - [`percent_rank`][datafusion.functions.percent_rank] + - [`lag`][datafusion.functions.lag] + - [`lead`][datafusion.functions.lead] +3. Aggregate Functions + : - All [Aggregation Functions](aggregations.md) can be used as window functions. + +## User-Defined Window Functions + +You can ship custom window functions to the engine by subclassing +[`WindowEvaluator`][datafusion.user_defined.WindowEvaluator] and registering it +via [`udwf`][datafusion.user_defined.udwf]. See [`user_defined`](../../reference/datafusion/user_defined.md) +for the evaluator interface and worked examples. + +
+

Note

+ +Serialization + +
+ + Python window UDFs travel inline inside pickled or + [`to_bytes`][datafusion.expr.Expr.to_bytes]-serialized expressions — + the evaluator class is captured by value via [`cloudpickle`][cloudpickle], so + worker processes do not need to pre-register the UDF. Any names the + evaluator resolves via `import` are captured **by reference** and + must be importable on the receiving worker. See + [`ipc`][datafusion.ipc] for the full IPC model and security caveats. diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.rst deleted file mode 100644 index 127f691b5..000000000 --- a/docs/source/user-guide/common-operations/windows.rst +++ /dev/null @@ -1,233 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _window_functions: - -Window Functions -================ - -In this section you will learn about window functions. A window function utilizes values from one or -multiple rows to produce a result for each individual row, unlike an aggregate function that -provides a single value for multiple rows. - -The window functions are available in the :py:mod:`~datafusion.functions` module. 
- -We'll use the pokemon dataset (from Ritchie Vink) in the following examples. - -.. ipython:: python - - from datafusion import SessionContext - from datafusion import col, lit - from datafusion import functions as f - - ctx = SessionContext() - df = ctx.read_csv("pokemon.csv") - -Here is an example that shows how you can compare each pokemon's speed to the speed of the -previous row in the DataFrame. - -.. ipython:: python - - df.select( - col('"Name"'), - col('"Speed"'), - f.lag(col('"Speed"')).alias("Previous Speed") - ) - -Setting Parameters ------------------- - - -Ordering -^^^^^^^^ - -You can control the order in which rows are processed by window functions by providing -a list of ``order_by`` functions for the ``order_by`` parameter. - -.. ipython:: python - - df.select( - col('"Name"'), - col('"Attack"'), - col('"Type 1"'), - f.rank( - partition_by=[col('"Type 1"')], - order_by=[col('"Attack"').sort(ascending=True)], - ).alias("rank"), - ).sort(col('"Type 1"'), col('"Attack"')) - -Partitions -^^^^^^^^^^ - -A window function can take a list of ``partition_by`` columns similar to an -:ref:`Aggregation Function`. This will cause the window values to be evaluated -independently for each of the partitions. In the example above, we found the rank of each -Pokemon per ``Type 1`` partitions. We can see the first couple of each partition if we do -the following: - -.. ipython:: python - - df.select( - col('"Name"'), - col('"Attack"'), - col('"Type 1"'), - f.rank( - partition_by=[col('"Type 1"')], - order_by=[col('"Attack"').sort(ascending=True)], - ).alias("rank"), - ).filter(col("rank") < lit(3)).sort(col('"Type 1"'), col("rank")) - -Window Frame -^^^^^^^^^^^^ - -When using aggregate functions, the Window Frame of defines the rows over which it operates. -If you do not specify a Window Frame, the frame will be set depending on the following -criteria. 
- -* If an ``order_by`` clause is set, the default window frame is defined as the rows between - unbounded preceding and the current row. -* If an ``order_by`` is not set, the default frame is defined as the rows between unbounded - and unbounded following (the entire partition). - -Window Frames are defined by three parameters: unit type, starting bound, and ending bound. - -The unit types available are: - -* Rows: The starting and ending boundaries are defined by the number of rows relative to the - current row. -* Range: When using Range, the ``order_by`` clause must have exactly one term. The boundaries - are defined bow how close the rows are to the value of the expression in the ``order_by`` - parameter. -* Groups: A "group" is the set of all rows that have equivalent values for all terms in the - ``order_by`` clause. - -In this example we perform a "rolling average" of the speed of the current Pokemon and the -two preceding rows. - -.. ipython:: python - - from datafusion.expr import Window, WindowFrame - - df.select( - col('"Name"'), - col('"Speed"'), - f.avg(col('"Speed"')) - .over(Window(window_frame=WindowFrame("rows", 2, 0), order_by=[col('"Speed"')])) - .alias("Previous Speed"), - ) - -Null Treatment -^^^^^^^^^^^^^^ - -When using aggregate functions as window functions, it is often useful to specify how null values -should be treated. In order to do this you need to use the builder function. In future releases -we expect this to be simplified in the interface. - -One common usage for handling nulls is the case where you want to find the last value up to the -current row. In the following example we demonstrate how setting the null treatment to ignore -nulls will fill in with the value of the most recent non-null row. To do this, we also will set -the window frame so that we only process up to the current row. - -In this example, we filter down to one specific type of Pokemon that does have some entries in -it's ``Type 2`` column that are null. - -.. 
ipython:: python - - from datafusion.common import NullTreatment - - df.filter(col('"Type 1"') == lit("Bug")).select( - '"Name"', - '"Type 2"', - f.last_value(col('"Type 2"')) - .over( - Window( - window_frame=WindowFrame("rows", None, 0), - order_by=[col('"Speed"')], - null_treatment=NullTreatment.IGNORE_NULLS, - ) - ) - .alias("last_wo_null"), - f.last_value(col('"Type 2"')) - .over( - Window( - window_frame=WindowFrame("rows", None, 0), - order_by=[col('"Speed"')], - null_treatment=NullTreatment.RESPECT_NULLS, - ) - ) - .alias("last_with_null"), - ) - -Aggregate Functions -------------------- - -You can use any :ref:`Aggregation Function` as a window function. Here -is an example that shows how to compare each pokemons’s attack power with the average attack -power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function. - -.. ipython:: python - :okwarning: - - df.select( - col('"Name"'), - col('"Attack"'), - col('"Type 1"'), - f.avg(col('"Attack"')).over( - Window( - window_frame=WindowFrame("rows", None, None), - partition_by=[col('"Type 1"')], - ) - ).alias("Average Attack"), - ) - -Available Functions -------------------- - -The possible window functions are: - -1. Rank Functions - - :py:func:`datafusion.functions.rank` - - :py:func:`datafusion.functions.dense_rank` - - :py:func:`datafusion.functions.ntile` - - :py:func:`datafusion.functions.row_number` - -2. Analytical Functions - - :py:func:`datafusion.functions.cume_dist` - - :py:func:`datafusion.functions.percent_rank` - - :py:func:`datafusion.functions.lag` - - :py:func:`datafusion.functions.lead` - -3. Aggregate Functions - - All :ref:`Aggregation Functions` can be used as window functions. - -User-Defined Window Functions ------------------------------ - -You can ship custom window functions to the engine by subclassing -:py:class:`~datafusion.user_defined.WindowEvaluator` and registering it -via :py:func:`~datafusion.udwf`. 
See :py:mod:`datafusion.user_defined` -for the evaluator interface and worked examples. - -.. note:: Serialization - - Python window UDFs travel inline inside pickled or - :py:meth:`~datafusion.expr.Expr.to_bytes`-serialized expressions — - the evaluator class is captured by value via :mod:`cloudpickle`, so - worker processes do not need to pre-register the UDF. Any names the - evaluator resolves via ``import`` are captured **by reference** and - must be importable on the receiving worker. See - :py:mod:`datafusion.ipc` for the full IPC model and security caveats. diff --git a/docs/source/user-guide/concepts.md b/docs/source/user-guide/concepts.md new file mode 100644 index 000000000..2cb896819 --- /dev/null +++ b/docs/source/user-guide/concepts.md @@ -0,0 +1,96 @@ + + + +# Concepts + +In this section, we will cover a basic example to introduce a few key concepts. We will use the +2021 Yellow Taxi Trip Records ([download](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet)), +from the [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). + +```python exec="1" source="material-block" result="text" session="concepts" +ctx = SessionContext() + +df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") + +df = df.select( + "trip_distance", + col("total_amount").alias("total"), + (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias( + "tip_percent" + ), +) + +df.show() +``` + + +## Session Context + +The first statement group creates a [`SessionContext`][datafusion.context.SessionContext]. + +```python +# create a context +ctx = datafusion.SessionContext() +``` + +A Session Context is the main interface for executing queries with DataFusion. It maintains the state +of the connection between a user and an instance of the DataFusion engine. Additionally it provides +the following functionality: + +- Create a DataFrame from a data source. 
+- Register a data source as a table that can be referenced from a SQL query. +- Execute a SQL query. + +## DataFrame + +The second statement group creates a [`DataFrame`][datafusion.dataframe.DataFrame]. + +```python +# Create a DataFrame from a file +df = ctx.read_parquet("yellow_tripdata_2021-01.parquet") +``` + +A DataFrame refers to a (logical) set of rows that share the same column names, similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). +DataFrames are typically created by calling a method on [`SessionContext`][datafusion.context.SessionContext], such as [`read_csv`][datafusion.context.SessionContext.read_csv], and can then be modified by +calling the transformation methods, such as [`filter`][datafusion.dataframe.DataFrame.filter], [`select`][datafusion.dataframe.DataFrame.select], [`aggregate`][datafusion.dataframe.DataFrame.aggregate], +and [`limit`][datafusion.dataframe.DataFrame.limit] to build up a query definition. + +For more details on working with DataFrames, including visualization options and conversion to other formats, see [dataframe/index](dataframe/index.md). + +## Expressions + +The third statement uses [Expressions](common-operations/expressions.md) to build up a query definition. You can find +explanations for what the functions below do in the user documentation for +[`col`][datafusion.col.col], [`lit`][datafusion.lit], [`round`][datafusion.functions.round], +and [`alias`][datafusion.expr.Expr.alias]. + +```python +df = df.select( + "trip_distance", + col("total_amount").alias("total"), + (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"), +) +``` + +Finally the [`show`][datafusion.dataframe.DataFrame.show] method converts the logical plan +represented by the DataFrame into a physical plan and executes it, collecting all results and +displaying them to the user.
It is important to note that DataFusion performs lazy evaluation +of the DataFrame. Until you call a method such as [`show`][datafusion.dataframe.DataFrame.show] +or [`collect`][datafusion.dataframe.DataFrame.collect], DataFusion will not perform the query. diff --git a/docs/source/user-guide/configuration.md b/docs/source/user-guide/configuration.md new file mode 100644 index 000000000..09dd70f6b --- /dev/null +++ b/docs/source/user-guide/configuration.md @@ -0,0 +1,184 @@ + + + +# Configuration + +Let's look at how we can configure DataFusion. When creating a [`SessionContext`][datafusion.context.SessionContext], you can pass in +a [`SessionConfig`][datafusion.context.SessionConfig] and [`RuntimeEnvBuilder`][datafusion.context.RuntimeEnvBuilder] object. These two cover a wide range of options. + +```python +from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext + +# create a session context with default settings +ctx = SessionContext() +print(ctx) + +# create a session context with explicit runtime and config settings +runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) +config = ( + SessionConfig() + .with_create_default_catalog_and_schema(True) + .with_default_catalog_and_schema("foo", "bar") + .with_target_partitions(8) + .with_information_schema(True) + .with_repartition_joins(False) + .with_repartition_aggregations(False) + .with_repartition_windows(False) + .with_parquet_pruning(False) + .set("datafusion.execution.parquet.pushdown_filters", "true") +) +ctx = SessionContext(config, runtime) +print(ctx) +``` + +## Maximizing CPU Usage + +DataFusion uses partitions to parallelize work. For small queries the +default configuration (number of CPU cores) is often sufficient, but to +fully utilize available hardware you can tune how many partitions are +created and when DataFusion will repartition data automatically. 
+ +Configure a `SessionContext` with a higher partition count: + +```python +from datafusion import SessionConfig, SessionContext + +# allow up to 16 concurrent partitions +config = SessionConfig().with_target_partitions(16) +ctx = SessionContext(config) +``` + +Automatic repartitioning for joins, aggregations, window functions and +other operations can be enabled to increase parallelism: + +```python +config = ( + SessionConfig() + .with_target_partitions(16) + .with_repartition_joins(True) + .with_repartition_aggregations(True) + .with_repartition_windows(True) +) +``` + +Manual repartitioning is available on DataFrames when you need precise +control: + +```python +from datafusion import col + +df = ctx.read_parquet("data.parquet") + +# Evenly divide into 16 partitions +df = df.repartition(16) + +# Or partition by the hash of a column +df = df.repartition_by_hash(col("a"), num=16) + +result = df.collect() +``` + +### Benchmark Example + +The repository includes a benchmark script that demonstrates how to maximize CPU usage +with DataFusion. The `benchmarks/max_cpu_usage.py` script shows a practical example +of configuring DataFusion for optimal parallelism. 
+ +You can run the benchmark script to see the impact of different configuration settings: + +```bash +# Run with default settings (uses all CPU cores) +python benchmarks/max_cpu_usage.py + +# Run with specific number of rows and partitions +python benchmarks/max_cpu_usage.py --rows 5000000 --partitions 16 + +# See all available options +python benchmarks/max_cpu_usage.py --help +``` + +Here's an example showing the performance difference between single and multiple partitions: + +```bash +# Single partition - slower processing +$ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 1 +Processed 10000000 rows using 1 partitions in 0.107s + +# Multiple partitions - faster processing +$ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 10 +Processed 10000000 rows using 10 partitions in 0.038s +``` + +This example demonstrates nearly 3x performance improvement (0.107s vs 0.038s) when using +10 partitions instead of 1, showcasing how proper partitioning can significantly improve +CPU utilization and query performance. + +The script demonstrates several key optimization techniques: + +1. **Higher target partition count**: Uses `with_target_partitions()` to set the number of concurrent partitions +2. **Automatic repartitioning**: Enables repartitioning for joins, aggregations, and window functions +3. **Manual repartitioning**: Uses `repartition()` to ensure all partitions are utilized +4. **CPU-intensive operations**: Performs aggregations that can benefit from parallelization + +The benchmark creates synthetic data and measures the time taken to perform a sum aggregation +across the specified number of partitions. This helps you understand how partition configuration +affects performance on your specific hardware. + +#### Important Considerations + +The provided benchmark script demonstrates partitioning concepts using synthetic in-memory data +and simple aggregation operations. 
While useful for understanding basic configuration principles, +actual performance in production environments may vary significantly based on numerous factors: + +**Data Sources and I/O Characteristics:** + +- **Table providers**: Performance differs greatly between Parquet files, CSV files, databases, and cloud storage +- **Storage type**: Local SSD, network-attached storage, and cloud storage have vastly different characteristics +- **Network latency**: Remote data sources introduce additional latency considerations +- **File sizes and distribution**: Large files may benefit differently from partitioning than many small files + +**Query and Workload Characteristics:** + +- **Operation complexity**: Simple aggregations versus complex joins, window functions, or nested queries +- **Data distribution**: Skewed data may not partition evenly, affecting parallel efficiency +- **Memory usage**: Large datasets may require different memory management strategies +- **Concurrent workloads**: Multiple queries running simultaneously affect resource allocation + +**Hardware and Environment Factors:** + +- **CPU architecture**: Different processors have varying parallel processing capabilities +- **Available memory**: Limited RAM may require different optimization strategies +- **System load**: Other applications competing for resources affect DataFusion performance + +**Recommendations for Production Use:** + +To optimize DataFusion for your specific use case, it is strongly recommended to: + +1. **Create custom benchmarks** using your actual data sources, formats, and query patterns +2. **Test with representative data volumes** that match your production workloads +3. **Measure end-to-end performance** including data loading, processing, and result handling +4. **Evaluate different configuration combinations** for your specific hardware and workload +5. 
**Monitor resource utilization** (CPU, memory, I/O) to identify bottlenecks in your environment + +This approach will provide more accurate insights into how DataFusion configuration options +will impact your particular applications and infrastructure. + +For more information about available [`SessionConfig`][datafusion.context.SessionConfig] options, see the [rust DataFusion Configuration guide](https://arrow.apache.org/datafusion/user-guide/configs.html), +and about `RuntimeEnvBuilder` options in the rust [online API documentation](https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeEnvBuilder.html). diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst deleted file mode 100644 index f8e613cd4..000000000 --- a/docs/source/user-guide/configuration.rst +++ /dev/null @@ -1,188 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _configuration: - -Configuration -============= - -Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in -a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeEnvBuilder` object. 
These two cover a wide range of options. - -.. code-block:: python - - from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext - - # create a session context with default settings - ctx = SessionContext() - print(ctx) - - # create a session context with explicit runtime and config settings - runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000) - config = ( - SessionConfig() - .with_create_default_catalog_and_schema(True) - .with_default_catalog_and_schema("foo", "bar") - .with_target_partitions(8) - .with_information_schema(True) - .with_repartition_joins(False) - .with_repartition_aggregations(False) - .with_repartition_windows(False) - .with_parquet_pruning(False) - .set("datafusion.execution.parquet.pushdown_filters", "true") - ) - ctx = SessionContext(config, runtime) - print(ctx) - -Maximizing CPU Usage --------------------- - -DataFusion uses partitions to parallelize work. For small queries the -default configuration (number of CPU cores) is often sufficient, but to -fully utilize available hardware you can tune how many partitions are -created and when DataFusion will repartition data automatically. - -Configure a ``SessionContext`` with a higher partition count: - -.. code-block:: python - - from datafusion import SessionConfig, SessionContext - - # allow up to 16 concurrent partitions - config = SessionConfig().with_target_partitions(16) - ctx = SessionContext(config) - -Automatic repartitioning for joins, aggregations, window functions and -other operations can be enabled to increase parallelism: - -.. code-block:: python - - config = ( - SessionConfig() - .with_target_partitions(16) - .with_repartition_joins(True) - .with_repartition_aggregations(True) - .with_repartition_windows(True) - ) - -Manual repartitioning is available on DataFrames when you need precise -control: - -.. 
code-block:: python - - from datafusion import col - - df = ctx.read_parquet("data.parquet") - - # Evenly divide into 16 partitions - df = df.repartition(16) - - # Or partition by the hash of a column - df = df.repartition_by_hash(col("a"), num=16) - - result = df.collect() - - -Benchmark Example -^^^^^^^^^^^^^^^^^ - -The repository includes a benchmark script that demonstrates how to maximize CPU usage -with DataFusion. The :code:`benchmarks/max_cpu_usage.py` script shows a practical example -of configuring DataFusion for optimal parallelism. - -You can run the benchmark script to see the impact of different configuration settings: - -.. code-block:: bash - - # Run with default settings (uses all CPU cores) - python benchmarks/max_cpu_usage.py - - # Run with specific number of rows and partitions - python benchmarks/max_cpu_usage.py --rows 5000000 --partitions 16 - - # See all available options - python benchmarks/max_cpu_usage.py --help - -Here's an example showing the performance difference between single and multiple partitions: - -.. code-block:: bash - - # Single partition - slower processing - $ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 1 - Processed 10000000 rows using 1 partitions in 0.107s - - # Multiple partitions - faster processing - $ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 10 - Processed 10000000 rows using 10 partitions in 0.038s - -This example demonstrates nearly 3x performance improvement (0.107s vs 0.038s) when using -10 partitions instead of 1, showcasing how proper partitioning can significantly improve -CPU utilization and query performance. - -The script demonstrates several key optimization techniques: - -1. **Higher target partition count**: Uses :code:`with_target_partitions()` to set the number of concurrent partitions -2. **Automatic repartitioning**: Enables repartitioning for joins, aggregations, and window functions -3. 
**Manual repartitioning**: Uses :code:`repartition()` to ensure all partitions are utilized -4. **CPU-intensive operations**: Performs aggregations that can benefit from parallelization - -The benchmark creates synthetic data and measures the time taken to perform a sum aggregation -across the specified number of partitions. This helps you understand how partition configuration -affects performance on your specific hardware. - -Important Considerations -"""""""""""""""""""""""" - -The provided benchmark script demonstrates partitioning concepts using synthetic in-memory data -and simple aggregation operations. While useful for understanding basic configuration principles, -actual performance in production environments may vary significantly based on numerous factors: - -**Data Sources and I/O Characteristics:** - -- **Table providers**: Performance differs greatly between Parquet files, CSV files, databases, and cloud storage -- **Storage type**: Local SSD, network-attached storage, and cloud storage have vastly different characteristics -- **Network latency**: Remote data sources introduce additional latency considerations -- **File sizes and distribution**: Large files may benefit differently from partitioning than many small files - -**Query and Workload Characteristics:** - -- **Operation complexity**: Simple aggregations versus complex joins, window functions, or nested queries -- **Data distribution**: Skewed data may not partition evenly, affecting parallel efficiency -- **Memory usage**: Large datasets may require different memory management strategies -- **Concurrent workloads**: Multiple queries running simultaneously affect resource allocation - -**Hardware and Environment Factors:** - -- **CPU architecture**: Different processors have varying parallel processing capabilities -- **Available memory**: Limited RAM may require different optimization strategies -- **System load**: Other applications competing for resources affect DataFusion performance - 
-**Recommendations for Production Use:** - -To optimize DataFusion for your specific use case, it is strongly recommended to: - -1. **Create custom benchmarks** using your actual data sources, formats, and query patterns -2. **Test with representative data volumes** that match your production workloads -3. **Measure end-to-end performance** including data loading, processing, and result handling -4. **Evaluate different configuration combinations** for your specific hardware and workload -5. **Monitor resource utilization** (CPU, memory, I/O) to identify bottlenecks in your environment - -This approach will provide more accurate insights into how DataFusion configuration options -will impact your particular applications and infrastructure. - -For more information about available :py:class:`~datafusion.context.SessionConfig` options, see the `rust DataFusion Configuration guide `_, -and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation `_. diff --git a/docs/source/user-guide/data-sources.md b/docs/source/user-guide/data-sources.md new file mode 100644 index 000000000..168d447a0 --- /dev/null +++ b/docs/source/user-guide/data-sources.md @@ -0,0 +1,280 @@ + + + +# Data Sources + +DataFusion provides a wide variety of ways to get data into a DataFrame to perform operations. + +## Local file + +DataFusion has the ability to read from a variety of popular file formats, such as [Parquet](io/parquet.md), +[CSV](io/csv.md), [JSON](io/json.md), and [AVRO](io/avro.md). + +```python exec="1" source="material-block" result="text" session="data-sources" +ctx = SessionContext() +df = ctx.read_csv("pokemon.csv") +df.show() +``` + + +## Create in-memory + +Sometimes it can be convenient to create a small DataFrame from a Python list or dictionary object. 
+To do this in DataFusion, you can use one of the three functions +[`from_pydict`][datafusion.context.SessionContext.from_pydict], +[`from_pylist`][datafusion.context.SessionContext.from_pylist], or +[`create_dataframe`][datafusion.context.SessionContext.create_dataframe]. + +As their names suggest, [`from_pydict`][datafusion.context.SessionContext.from_pydict] and [`from_pylist`][datafusion.context.SessionContext.from_pylist] will create DataFrames from Python +dictionary and list objects, respectively. [`create_dataframe`][datafusion.context.SessionContext.create_dataframe] assumes you will pass in a list +of lists of [PyArrow Record Batches](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html). + +The following three examples all will create identical DataFrames: + +```python exec="1" source="material-block" result="text" session="data-sources" +import pyarrow as pa + +ctx.from_pylist( + [ + {"a": 1, "b": 10.0, "c": "alpha"}, + {"a": 2, "b": 20.0, "c": "beta"}, + {"a": 3, "b": 30.0, "c": "gamma"}, + ] +).show() + +ctx.from_pydict( + { + "a": [1, 2, 3], + "b": [10.0, 20.0, 30.0], + "c": ["alpha", "beta", "gamma"], + } +).show() + +batch = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2, 3]), + pa.array([10.0, 20.0, 30.0]), + pa.array(["alpha", "beta", "gamma"]), + ], + names=["a", "b", "c"], +) + +ctx.create_dataframe([[batch]]).show() +``` + + +## Object Store + +DataFusion has support for multiple storage options in addition to local files. +The example below requires an appropriate S3 account with access credentials.
+ +Supported Object Stores are + +- [`AmazonS3`][datafusion.object_store.AmazonS3] +- [`GoogleCloud`][datafusion.object_store.GoogleCloud] +- [`Http`][datafusion.object_store.Http] +- [`LocalFileSystem`][datafusion.object_store.LocalFileSystem] +- [`MicrosoftAzure`][datafusion.object_store.MicrosoftAzure] + +```python +from datafusion.object_store import AmazonS3 + +region = "us-east-1" +bucket_name = "yellow-trips" + +s3 = AmazonS3( + bucket_name=bucket_name, + region=region, + access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), +) + +path = f"s3://{bucket_name}/" +ctx.register_object_store("s3://", s3, None) + +ctx.register_parquet("trips", path) + +ctx.table("trips").show() +``` + +## Other DataFrame Libraries + +DataFusion can import DataFrames directly from other libraries, such as +[Polars](https://pola.rs/) and [Pandas](https://pandas.pydata.org/). +Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule +interface can be imported to DataFusion using the +[`from_arrow`][datafusion.context.SessionContext.from_arrow] function. Older versions of Polars may +not support the arrow interface. In those cases, you can still import via the +[`from_polars`][datafusion.context.SessionContext.from_polars] function. + +```python +import pandas as pd + +data = { "a": [1, 2, 3], "b": [10.0, 20.0, 30.0], "c": ["alpha", "beta", "gamma"] } +pandas_df = pd.DataFrame(data) + +datafusion_df = ctx.from_arrow(pandas_df) +datafusion_df.show() +``` + +```python +import polars as pl +polars_df = pl.DataFrame(data) + +datafusion_df = ctx.from_arrow(polars_df) +datafusion_df.show() +``` + +## Delta Lake + +DataFusion 43.0.0 and later support the ability to register table providers from sources such +as Delta Lake. This will require a recent version of +[deltalake](https://delta-io.github.io/delta-rs/) to provide the required interfaces. 
+ +```python +from deltalake import DeltaTable + +delta_table = DeltaTable("path_to_table") +ctx.register_table("my_delta_table", delta_table) +df = ctx.table("my_delta_table") +df.show() +``` + +On older versions of [`deltalake`](https://delta-io.github.io/delta-rs/) (prior to 0.22) you can use the +[Arrow DataSet](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html) +interface to import to DataFusion, but this does not support features such as filter push down +which can lead to a significant performance difference. + +```python +from deltalake import DeltaTable + +delta_table = DeltaTable("path_to_table") +ctx.register_dataset("my_delta_table", delta_table.to_pyarrow_dataset()) +df = ctx.table("my_delta_table") +df.show() +``` + +## Apache Iceberg + +DataFusion 45.0.0 and later support the ability to register Apache Iceberg tables as table providers through the Custom Table Provider interface. + +This requires either the [pyiceberg](https://pypi.org/project/pyiceberg/) library (>=0.10.0) or the [pyiceberg-core](https://pypi.org/project/pyiceberg-core/) library (>=0.5.0). + +- The `pyiceberg-core` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as python bindings. +- The `pyiceberg` library utilizes the `pyiceberg-core` python bindings under the hood and provides a native way for Python users to interact with the DataFusion. 
+ +```python +from datafusion import SessionContext +from pyiceberg.catalog import load_catalog +import pyarrow as pa + +# Load catalog and create/load a table +catalog = load_catalog("catalog", type="in-memory") +catalog.create_namespace_if_not_exists("default") + +# Create some sample data +data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]}) +iceberg_table = catalog.create_table("default.test", schema=data.schema) +iceberg_table.append(data) + +# Register the table with DataFusion +ctx = SessionContext() +ctx.register_table_provider("test", iceberg_table) + +# Query the table using DataFusion +ctx.table("test").show() +``` + +Note that the DataFusion integration relies on features from the [Iceberg Rust](https://github.com/apache/iceberg-rust/) implementation instead of the [PyIceberg](https://github.com/apache/iceberg-python/) implementation. +Features that are available in PyIceberg but not yet in Iceberg Rust will not be available when using DataFusion. + +## Custom Table Provider + +You can implement a custom Data Provider in Rust and expose it to DataFusion through the +interface as described in the [Custom Table Provider](io/table_provider.md) +section. This is an advanced topic, but a +[user example](https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example) +is provided in the DataFusion repository. + +# Catalog + +A common technique for organizing tables is using a three level hierarchical approach. DataFusion +supports this form of organizing using the [`Catalog`][datafusion.catalog.Catalog], +[`Schema`][datafusion.catalog.Schema], and [`Table`][datafusion.catalog.Table]. By default, +a [`SessionContext`][datafusion.context.SessionContext] comes with a single Catalog and a single Schema +with the names `datafusion` and `public`, respectively. + +The default implementation uses an in-memory approach to the catalog and schema. We have support +for adding additional in-memory catalogs and schemas.
You can access tables registered in a schema +either through the DataFrame API or via SQL commands. This can be done like in the following +example: + +```python +import pyarrow as pa +from datafusion.catalog import Catalog, Schema +from datafusion import SessionContext + +ctx = SessionContext() + +my_catalog = Catalog.memory_catalog() +my_schema = Schema.memory_schema() +my_catalog.register_schema('my_schema_name', my_schema) +ctx.register_catalog_provider('my_catalog_name', my_catalog) + +# Create an in-memory table +table = pa.table({ + 'name': ['Bulbasaur', 'Charmander', 'Squirtle'], + 'type': ['Grass', 'Fire', 'Water'], + 'hp': [45, 39, 44], +}) +df = ctx.create_dataframe([table.to_batches()], name='pokemon') + +my_schema.register_table('pokemon', df) + +ctx.sql('SELECT * FROM my_catalog_name.my_schema_name.pokemon').show() +``` + +## User Defined Catalog and Schema + +If the in-memory catalogs are insufficient for your uses, there are two approaches you can take +to implementing a custom catalog and/or schema. In the below discussion, we describe how to +implement these for a Catalog, but the approach to implementing for a Schema is nearly +identical. + +DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust, +you will need to export it as a Python library via PyO3. There is a complete example of a +catalog implemented this way in the +[examples folder](https://github.com/apache/datafusion-python/tree/main/examples/) +of our repository. Writing catalog providers in Rust can typically lead to significant +performance improvements over the Python-based approach. + +To implement a Catalog in Python, you will need to inherit from the abstract base class +[`CatalogProvider`][datafusion.catalog.CatalogProvider].
There are examples in the +[unit tests](https://github.com/apache/datafusion-python/tree/main/python/tests) of +implementing a basic Catalog in Python where we simply keep a dictionary of the +registered Schemas. + +One important note for developers is that when we have a Catalog defined in Python, we have +two different ways of accessing this Catalog. First, we register the catalog with a Rust +wrapper. This allows any Rust-based code to call the Python functions as necessary. +Second, if the user accesses the Catalog via the Python API, we identify this and return +the original Python object that implements the Catalog. This is an important distinction +for developers because we do *not* return a Python wrapper around the Rust wrapper of the +original Python object. diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst deleted file mode 100644 index 48ff4c014..000000000 --- a/docs/source/user-guide/data-sources.rst +++ /dev/null @@ -1,286 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _user_guide_data_sources: - -Data Sources -============ - -DataFusion provides a wide variety of ways to get data into a DataFrame to perform operations.
- -Local file ----------- - -DataFusion has the ability to read from a variety of popular file formats, such as :ref:`Parquet `, -:ref:`CSV `, :ref:`JSON `, and :ref:`AVRO `. - -.. ipython:: python - - from datafusion import SessionContext - ctx = SessionContext() - df = ctx.read_csv("pokemon.csv") - df.show() - -Create in-memory ----------------- - -Sometimes it can be convenient to create a small DataFrame from a Python list or dictionary object. -To do this in DataFusion, you can use one of the three functions -:py:func:`~datafusion.context.SessionContext.from_pydict`, -:py:func:`~datafusion.context.SessionContext.from_pylist`, or -:py:func:`~datafusion.context.SessionContext.create_dataframe`. - -As their names suggest, ``from_pydict`` and ``from_pylist`` will create DataFrames from Python -dictionary and list objects, respectively. ``create_dataframe`` assumes you will pass in a list -of list of `PyArrow Record Batches `_. - -The following three examples all will create identical DataFrames: - -.. ipython:: python - - import pyarrow as pa - - ctx.from_pylist([ - { "a": 1, "b": 10.0, "c": "alpha" }, - { "a": 2, "b": 20.0, "c": "beta" }, - { "a": 3, "b": 30.0, "c": "gamma" }, - ]).show() - - ctx.from_pydict({ - "a": [1, 2, 3], - "b": [10.0, 20.0, 30.0], - "c": ["alpha", "beta", "gamma"], - }).show() - - batch = pa.RecordBatch.from_arrays( - [ - pa.array([1, 2, 3]), - pa.array([10.0, 20.0, 30.0]), - pa.array(["alpha", "beta", "gamma"]), - ], - names=["a", "b", "c"], - ) - - ctx.create_dataframe([[batch]]).show() - - -Object Store ------------- - -DataFusion has support for multiple storage options in addition to local files. -The example below requires an appropriate S3 account with access credentials. 
- -Supported Object Stores are - -- :py:class:`~datafusion.object_store.AmazonS3` -- :py:class:`~datafusion.object_store.GoogleCloud` -- :py:class:`~datafusion.object_store.Http` -- :py:class:`~datafusion.object_store.LocalFileSystem` -- :py:class:`~datafusion.object_store.MicrosoftAzure` - -.. code-block:: python - - from datafusion.object_store import AmazonS3 - - region = "us-east-1" - bucket_name = "yellow-trips" - - s3 = AmazonS3( - bucket_name=bucket_name, - region=region, - access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), - secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"), - ) - - path = f"s3://{bucket_name}/" - ctx.register_object_store("s3://", s3, None) - - ctx.register_parquet("trips", path) - - ctx.table("trips").show() - -Other DataFrame Libraries -------------------------- - -DataFusion can import DataFrames directly from other libraries, such as -`Polars `_ and `Pandas `_. -Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule -interface can be imported to DataFusion using the -:py:func:`~datafusion.context.SessionContext.from_arrow` function. Older versions of Polars may -not support the arrow interface. In those cases, you can still import via the -:py:func:`~datafusion.context.SessionContext.from_polars` function. - -.. code-block:: python - - import pandas as pd - - data = { "a": [1, 2, 3], "b": [10.0, 20.0, 30.0], "c": ["alpha", "beta", "gamma"] } - pandas_df = pd.DataFrame(data) - - datafusion_df = ctx.from_arrow(pandas_df) - datafusion_df.show() - -.. code-block:: python - - import polars as pl - polars_df = pl.DataFrame(data) - - datafusion_df = ctx.from_arrow(polars_df) - datafusion_df.show() - -Delta Lake ----------- - -DataFusion 43.0.0 and later support the ability to register table providers from sources such -as Delta Lake. This will require a recent version of -`deltalake `_ to provide the required interfaces. - -.. 
code-block:: python - - from deltalake import DeltaTable - - delta_table = DeltaTable("path_to_table") - ctx.register_table("my_delta_table", delta_table) - df = ctx.table("my_delta_table") - df.show() - -On older versions of ``deltalake`` (prior to 0.22) you can use the -`Arrow DataSet `_ -interface to import to DataFusion, but this does not support features such as filter push down -which can lead to a significant performance difference. - -.. code-block:: python - - from deltalake import DeltaTable - - delta_table = DeltaTable("path_to_table") - ctx.register_dataset("my_delta_table", delta_table.to_pyarrow_dataset()) - df = ctx.table("my_delta_table") - df.show() - -Apache Iceberg --------------- - -DataFusion 45.0.0 and later support the ability to register Apache Iceberg tables as table providers through the Custom Table Provider interface. - -This requires either the `pyiceberg `__ library (>=0.10.0) or the `pyiceberg-core `__ library (>=0.5.0). - -* The ``pyiceberg-core`` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as python bindings. -* The ``pyiceberg`` library utilizes the ``pyiceberg-core`` python bindings under the hood and provides a native way for Python users to interact with the DataFusion. - -.. 
code-block:: python - - from datafusion import SessionContext - from pyiceberg.catalog import load_catalog - import pyarrow as pa - - # Load catalog and create/load a table - catalog = load_catalog("catalog", type="in-memory") - catalog.create_namespace_if_not_exists("default") - - # Create some sample data - data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]}) - iceberg_table = catalog.create_table("default.test", schema=data.schema) - iceberg_table.append(data) - - # Register the table with DataFusion - ctx = SessionContext() - ctx.register_table_provider("test", iceberg_table) - - # Query the table using DataFusion - ctx.table("test").show() - - -Note that the Datafusion integration rely on features from the `Iceberg Rust `_ implementation instead of the `PyIceberg `_ implementation. -Features that are available in PyIceberg but not yet in Iceberg Rust will not be available when using DataFusion. - -Custom Table Provider ---------------------- - -You can implement a custom Data Provider in Rust and expose it to DataFusion through the -the interface as describe in the :ref:`Custom Table Provider ` -section. This is an advanced topic, but a -`user example `_ -is provided in the DataFusion repository. - -Catalog -======= - -A common technique for organizing tables is using a three level hierarchical approach. DataFusion -supports this form of organizing using the :py:class:`~datafusion.catalog.Catalog`, -:py:class:`~datafusion.catalog.Schema`, and :py:class:`~datafusion.catalog.Table`. By default, -a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema -with the names ``datafusion`` and ``public``, respectively. - -The default implementation uses an in-memory approach to the catalog and schema. We have support -for adding additional in-memory catalogs and schemas. You can access tables registered in a schema -either through the Dataframe API or via sql commands. This can be done like in the following -example: - -.. 
code-block:: python - - import pyarrow as pa - from datafusion.catalog import Catalog, Schema - from datafusion import SessionContext - - ctx = SessionContext() - - my_catalog = Catalog.memory_catalog() - my_schema = Schema.memory_schema() - my_catalog.register_schema('my_schema_name', my_schema) - ctx.register_catalog_provider('my_catalog_name', my_catalog) - - # Create an in-memory table - table = pa.table({ - 'name': ['Bulbasaur', 'Charmander', 'Squirtle'], - 'type': ['Grass', 'Fire', 'Water'], - 'hp': [45, 39, 44], - }) - df = ctx.create_dataframe([table.to_batches()], name='pokemon') - - my_schema.register_table('pokemon', df) - - ctx.sql('SELECT * FROM my_catalog_name.my_schema_name.pokemon').show() - -User Defined Catalog and Schema -------------------------------- - -If the in-memory catalogs are insufficient for your uses, there are two approaches you can take -to implementing a custom catalog and/or schema. In the below discussion, we describe how to -implement these for a Catalog, but the approach to implementing for a Schema is nearly -identical. - -DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust, -you will need to export it as a Python library via PyO3. There is a complete example of a -catalog implemented this way in the -`examples folder `_ -of our repository. Writing catalog providers in Rust provides typically can lead to significant -performance improvements over the Python based approach. - -To implement a Catalog in Python, you will need to inherit from the abstract base class -:py:class:`~datafusion.catalog.CatalogProvider`. There are examples in the -`unit tests `_ of -implementing a basic Catalog in Python where we simply keep a dictionary of the -registered Schemas. - -One important note for developers is that when we have a Catalog defined in Python, we have -two different ways of accessing this Catalog. First, we register the catalog with a Rust -wrapper. 
This allows for any rust based code to call the Python functions as necessary. -Second, if the user access the Catalog via the Python API, we identify this and return back -the original Python object that implements the Catalog. This is an important distinction -for developers because we do *not* return a Python wrapper around the Rust wrapper of the -original Python object. diff --git a/docs/source/user-guide/dataframe/execution-metrics.md b/docs/source/user-guide/dataframe/execution-metrics.md new file mode 100644 index 000000000..1e700e5be --- /dev/null +++ b/docs/source/user-guide/dataframe/execution-metrics.md @@ -0,0 +1,192 @@ + + + +# Execution Metrics + +## Overview + +When DataFusion executes a query it compiles the logical plan into a tree of +*physical plan operators* (e.g. `FilterExec`, `ProjectionExec`, +`HashAggregateExec`). Each operator can record runtime statistics while it +runs. These statistics are called **execution metrics**. + +Typical metrics include: + +- **output_rows** – number of rows produced by the operator +- **elapsed_compute** – total CPU time (nanoseconds) spent inside the operator +- **spill_count** – number of times the operator spilled data to disk +- **spilled_bytes** – total bytes written to disk during spills +- **spilled_rows** – total rows written to disk during spills + +Metrics are collected *per-partition*: DataFusion may execute each operator +in parallel across several partitions. The convenience properties on +[`MetricsSet`][datafusion.plan.MetricsSet] (e.g. `output_rows`, `elapsed_compute`) +automatically sum the named metric across **all** partitions, giving a single +aggregate value for the operator as a whole. You can also access the raw +per-partition [`Metric`][datafusion.plan.Metric] objects via +[`metrics`][datafusion.plan.MetricsSet.metrics]. + +## When Are Metrics Available? 
+ +Some operators (for example `DataSourceExec`) eagerly create a +[`MetricsSet`][datafusion.plan.MetricsSet] when the physical plan is built, so +[`metrics`][datafusion.plan.ExecutionPlan.metrics] may return a set even before any +rows have been processed. However, metric **values** such as `output_rows` +are only meaningful **after** the DataFrame has been executed via one of the +terminal operations: + +- [`collect`][datafusion.dataframe.DataFrame.collect] +- [`collect_partitioned`][datafusion.dataframe.DataFrame.collect_partitioned] +- [`execute_stream`][datafusion.dataframe.DataFrame.execute_stream] + (metrics are available once the stream has been fully consumed) +- [`execute_stream_partitioned`][datafusion.dataframe.DataFrame.execute_stream_partitioned] + (metrics are available once all partition streams have been fully consumed) + +Before execution, metric values will be `0` or `None`. + +!!! note + + **display() does not populate metrics.** + When a DataFrame is displayed in a notebook (e.g. via `display(df)` or + automatic `repr` output), DataFusion runs a *limited* internal execution + to fetch preview rows. This internal execution does **not** cache the + physical plan used, so [`collect_metrics`][datafusion.plan.ExecutionPlan.collect_metrics] + will not reflect the display execution. To access metrics you must call + one of the terminal operations listed above. + +If you call [`collect`][datafusion.dataframe.DataFrame.collect] (or another terminal +operation) multiple times on the same DataFrame, each call creates a fresh +physical plan. Metrics from [`execution_plan`][datafusion.dataframe.DataFrame.execution_plan] +always reflect the **most recent** execution. + +## Reading the Physical Plan Tree + +[`execution_plan`][datafusion.dataframe.DataFrame.execution_plan] returns the root +[`ExecutionPlan`][datafusion.plan.ExecutionPlan] node of the physical plan tree. 
The tree +mirrors the operator pipeline: the root is typically a projection or +coalescing node; its children are filters, aggregates, scans, etc. + +The `operator_name` string returned by +[`collect_metrics`][datafusion.plan.ExecutionPlan.collect_metrics] is the *display* name of +the node, for example `"FilterExec: column1@0 > 1"`. This is the same string +you would see when calling `plan.display()`. + +## Aggregated vs Per-Partition Metrics + +DataFusion executes each operator across one or more **partitions** in +parallel. The [`MetricsSet`][datafusion.plan.MetricsSet] convenience properties +(`output_rows`, `elapsed_compute`, etc.) automatically **sum** the named +metric across all partitions, giving a single aggregate value. + +To inspect individual partitions — for example to detect data skew where one +partition processes far more rows than others — iterate over the raw +[`Metric`][datafusion.plan.Metric] objects: + +```python +for metric in metrics_set.metrics(): + print(f" partition={metric.partition} {metric.name}={metric.value}") +``` + +The `partition` property is a 0-based index (`0`, `1`, …) identifying +which parallel slot processed this metric. It is `None` for metrics that +apply globally (not tied to a specific partition). + +## Available Metrics + +The following metrics are directly accessible as properties on +[`MetricsSet`][datafusion.plan.MetricsSet]: + +| Property | Description | +|----------|-------------| +| `output_rows` | Number of rows emitted by the operator (summed across partitions). | +| `elapsed_compute` | Wall-clock CPU time **in nanoseconds** spent inside the operator's compute loop, excluding I/O wait. Useful for identifying which operators are most expensive (summed across partitions). | +| `spill_count` | Number of spill-to-disk events triggered by memory pressure. This is a unitless count of events, not a measure of data volume (summed across partitions). 
| +| `spilled_bytes` | Total bytes written to disk during spill events (summed across partitions). | +| `spilled_rows` | Total rows written to disk during spill events (summed across partitions). | + +Any metric not listed above can be accessed via +[`sum_by_name`][datafusion.plan.MetricsSet.sum_by_name], or by iterating over the raw +[`Metric`][datafusion.plan.Metric] objects returned by +[`metrics`][datafusion.plan.MetricsSet.metrics]. + +## Labels + +A [`Metric`][datafusion.plan.Metric] may carry *labels*: key/value pairs that +provide additional context. Labels are operator-specific; most metrics have +an empty label dict. + +Some operators tag their metrics with labels to distinguish variants. For +example, a `HashAggregateExec` may record separate `output_rows` metrics +for intermediate and final output: + +```python +for metric in metrics_set.metrics(): + print(metric.name, metric.labels()) +# output_rows {'output_type': 'final'} +# output_rows {'output_type': 'intermediate'} +``` + +When summing by name (via [`output_rows`][datafusion.plan.MetricsSet.output_rows] or +[`sum_by_name`][datafusion.plan.MetricsSet.sum_by_name]), **all** metrics with that +name are summed regardless of labels. To filter by label, iterate over the +raw [`Metric`][datafusion.plan.Metric] objects directly. 
+ +## End-to-End Example + +```python +from datafusion import SessionContext + +ctx = SessionContext() +ctx.sql("CREATE TABLE sales AS VALUES (1, 100), (2, 200), (3, 50)") + +df = ctx.sql("SELECT * FROM sales WHERE column1 > 1") + +# Execute the query — this populates the metrics +results = df.collect() + +# Retrieve the physical plan with metrics +plan = df.execution_plan() + +# Walk every operator and print its metrics +for operator_name, ms in plan.collect_metrics(): + if ms.output_rows is not None: + print(f"{operator_name}") + print(f" output_rows = {ms.output_rows}") + print(f" elapsed_compute = {ms.elapsed_compute} ns") + +# Access raw per-partition metrics +for operator_name, ms in plan.collect_metrics(): + for metric in ms.metrics(): + print( + f" partition={metric.partition} " + f"{metric.name}={metric.value} " + f"labels={metric.labels()}" + ) +``` + +## API Reference + +- [`ExecutionPlan`][datafusion.plan.ExecutionPlan] — physical plan node +- [`collect_metrics`][datafusion.plan.ExecutionPlan.collect_metrics] — walk the tree and + return `(operator_name, MetricsSet)` pairs +- [`metrics`][datafusion.plan.ExecutionPlan.metrics] — return the + [`MetricsSet`][datafusion.plan.MetricsSet] for a single node +- [`MetricsSet`][datafusion.plan.MetricsSet] — aggregated metrics for one operator +- [`Metric`][datafusion.plan.Metric] — a single per-partition metric value diff --git a/docs/source/user-guide/dataframe/execution-metrics.rst b/docs/source/user-guide/dataframe/execution-metrics.rst deleted file mode 100644 index 764fa76ef..000000000 --- a/docs/source/user-guide/dataframe/execution-metrics.rst +++ /dev/null @@ -1,215 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. 
"License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _execution_metrics: - -Execution Metrics -================= - -Overview --------- - -When DataFusion executes a query it compiles the logical plan into a tree of -*physical plan operators* (e.g. ``FilterExec``, ``ProjectionExec``, -``HashAggregateExec``). Each operator can record runtime statistics while it -runs. These statistics are called **execution metrics**. - -Typical metrics include: - -- **output_rows** – number of rows produced by the operator -- **elapsed_compute** – total CPU time (nanoseconds) spent inside the operator -- **spill_count** – number of times the operator spilled data to disk -- **spilled_bytes** – total bytes written to disk during spills -- **spilled_rows** – total rows written to disk during spills - -Metrics are collected *per-partition*: DataFusion may execute each operator -in parallel across several partitions. The convenience properties on -:py:class:`~datafusion.MetricsSet` (e.g. ``output_rows``, ``elapsed_compute``) -automatically sum the named metric across **all** partitions, giving a single -aggregate value for the operator as a whole. You can also access the raw -per-partition :py:class:`~datafusion.Metric` objects via -:py:meth:`~datafusion.MetricsSet.metrics`. - -When Are Metrics Available? 
---------------------------- - -Some operators (for example ``DataSourceExec``) eagerly create a -:py:class:`~datafusion.MetricsSet` when the physical plan is built, so -:py:meth:`~datafusion.ExecutionPlan.metrics` may return a set even before any -rows have been processed. However, metric **values** such as ``output_rows`` -are only meaningful **after** the DataFrame has been executed via one of the -terminal operations: - -- :py:meth:`~datafusion.DataFrame.collect` -- :py:meth:`~datafusion.DataFrame.collect_partitioned` -- :py:meth:`~datafusion.DataFrame.execute_stream` - (metrics are available once the stream has been fully consumed) -- :py:meth:`~datafusion.DataFrame.execute_stream_partitioned` - (metrics are available once all partition streams have been fully consumed) - -Before execution, metric values will be ``0`` or ``None``. - -.. note:: - - **display() does not populate metrics.** - When a DataFrame is displayed in a notebook (e.g. via ``display(df)`` or - automatic ``repr`` output), DataFusion runs a *limited* internal execution - to fetch preview rows. This internal execution does **not** cache the - physical plan used, so :py:meth:`~datafusion.ExecutionPlan.collect_metrics` - will not reflect the display execution. To access metrics you must call - one of the terminal operations listed above. - -If you call :py:meth:`~datafusion.DataFrame.collect` (or another terminal -operation) multiple times on the same DataFrame, each call creates a fresh -physical plan. Metrics from :py:meth:`~datafusion.DataFrame.execution_plan` -always reflect the **most recent** execution. - -Reading the Physical Plan Tree --------------------------------- - -:py:meth:`~datafusion.DataFrame.execution_plan` returns the root -:py:class:`~datafusion.ExecutionPlan` node of the physical plan tree. The tree -mirrors the operator pipeline: the root is typically a projection or -coalescing node; its children are filters, aggregates, scans, etc. 
- -The ``operator_name`` string returned by -:py:meth:`~datafusion.ExecutionPlan.collect_metrics` is the *display* name of -the node, for example ``"FilterExec: column1@0 > 1"``. This is the same string -you would see when calling ``plan.display()``. - -Aggregated vs Per-Partition Metrics ------------------------------------- - -DataFusion executes each operator across one or more **partitions** in -parallel. The :py:class:`~datafusion.MetricsSet` convenience properties -(``output_rows``, ``elapsed_compute``, etc.) automatically **sum** the named -metric across all partitions, giving a single aggregate value. - -To inspect individual partitions — for example to detect data skew where one -partition processes far more rows than others — iterate over the raw -:py:class:`~datafusion.Metric` objects: - -.. code-block:: python - - for metric in metrics_set.metrics(): - print(f" partition={metric.partition} {metric.name}={metric.value}") - -The ``partition`` property is a 0-based index (``0``, ``1``, …) identifying -which parallel slot processed this metric. It is ``None`` for metrics that -apply globally (not tied to a specific partition). - -Available Metrics ------------------ - -The following metrics are directly accessible as properties on -:py:class:`~datafusion.MetricsSet`: - -.. list-table:: - :header-rows: 1 - :widths: 25 75 - - * - Property - - Description - * - ``output_rows`` - - Number of rows emitted by the operator (summed across partitions). - * - ``elapsed_compute`` - - Wall-clock CPU time **in nanoseconds** spent inside the operator's - compute loop, excluding I/O wait. Useful for identifying which - operators are most expensive (summed across partitions). - * - ``spill_count`` - - Number of spill-to-disk events triggered by memory pressure. This is - a unitless count of events, not a measure of data volume (summed across - partitions). - * - ``spilled_bytes`` - - Total bytes written to disk during spill events (summed across - partitions). 
- * - ``spilled_rows`` - - Total rows written to disk during spill events (summed across - partitions). - -Any metric not listed above can be accessed via -:py:meth:`~datafusion.MetricsSet.sum_by_name`, or by iterating over the raw -:py:class:`~datafusion.Metric` objects returned by -:py:meth:`~datafusion.MetricsSet.metrics`. - -Labels ------- - -A :py:class:`~datafusion.Metric` may carry *labels*: key/value pairs that -provide additional context. Labels are operator-specific; most metrics have -an empty label dict. - -Some operators tag their metrics with labels to distinguish variants. For -example, a ``HashAggregateExec`` may record separate ``output_rows`` metrics -for intermediate and final output: - -.. code-block:: python - - for metric in metrics_set.metrics(): - print(metric.name, metric.labels()) - # output_rows {'output_type': 'final'} - # output_rows {'output_type': 'intermediate'} - -When summing by name (via :py:attr:`~datafusion.MetricsSet.output_rows` or -:py:meth:`~datafusion.MetricsSet.sum_by_name`), **all** metrics with that -name are summed regardless of labels. To filter by label, iterate over the -raw :py:class:`~datafusion.Metric` objects directly. - -End-to-End Example ------------------- - -.. 
code-block:: python - - from datafusion import SessionContext - - ctx = SessionContext() - ctx.sql("CREATE TABLE sales AS VALUES (1, 100), (2, 200), (3, 50)") - - df = ctx.sql("SELECT * FROM sales WHERE column1 > 1") - - # Execute the query — this populates the metrics - results = df.collect() - - # Retrieve the physical plan with metrics - plan = df.execution_plan() - - # Walk every operator and print its metrics - for operator_name, ms in plan.collect_metrics(): - if ms.output_rows is not None: - print(f"{operator_name}") - print(f" output_rows = {ms.output_rows}") - print(f" elapsed_compute = {ms.elapsed_compute} ns") - - # Access raw per-partition metrics - for operator_name, ms in plan.collect_metrics(): - for metric in ms.metrics(): - print( - f" partition={metric.partition} " - f"{metric.name}={metric.value} " - f"labels={metric.labels()}" - ) - -API Reference -------------- - -- :py:class:`datafusion.ExecutionPlan` — physical plan node -- :py:meth:`datafusion.ExecutionPlan.collect_metrics` — walk the tree and - return ``(operator_name, MetricsSet)`` pairs -- :py:meth:`datafusion.ExecutionPlan.metrics` — return the - :py:class:`~datafusion.MetricsSet` for a single node -- :py:class:`datafusion.MetricsSet` — aggregated metrics for one operator -- :py:class:`datafusion.Metric` — a single per-partition metric value diff --git a/docs/source/user-guide/dataframe/index.md b/docs/source/user-guide/dataframe/index.md new file mode 100644 index 000000000..c11448999 --- /dev/null +++ b/docs/source/user-guide/dataframe/index.md @@ -0,0 +1,370 @@ + + +# DataFrames + +## Overview + +The [`DataFrame`][datafusion.dataframe.DataFrame] class is the core abstraction in DataFusion that represents tabular data and operations +on that data. DataFrames provide a flexible API for transforming data through various operations such as +filtering, projection, aggregation, joining, and more. + +A DataFrame represents a logical plan that is lazily evaluated. 
The actual execution occurs only when +terminal operations like [`collect()`][datafusion.dataframe.DataFrame.collect], [`show()`][datafusion.dataframe.DataFrame.show], or [`to_pandas()`][datafusion.dataframe.DataFrame.to_pandas] are called. + +## Creating DataFrames + +DataFrames can be created in several ways: + +- From SQL queries via a [`SessionContext`][datafusion.context.SessionContext]: + + ```python + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.sql("SELECT * FROM your_table") + ``` + +- From registered tables: + + ```python + df = ctx.table("your_table") + ``` + +- From various data sources: + + ```python + # From CSV files (see the CSV page of the I/O Guide for detailed options) + df = ctx.read_csv("path/to/data.csv") + + # From Parquet files (see the Parquet page of the I/O Guide for detailed options) + df = ctx.read_parquet("path/to/data.parquet") + + # From JSON files (see the JSON page of the I/O Guide for detailed options) + df = ctx.read_json("path/to/data.json") + + # From Avro files (see the Avro page of the I/O Guide for detailed options) + df = ctx.read_avro("path/to/data.avro") + + # From Pandas DataFrame + import pandas as pd + pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = ctx.from_pandas(pandas_df) + + # From Arrow data + import pyarrow as pa + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"] + ) + df = ctx.from_arrow(batch) + ``` + +For detailed information about reading from different data sources, see the [I/O Guide](../io/index.md). +For custom data sources, see [Custom Table Provider](../../user-guide/io/table_provider.md).
+ +## Common DataFrame Operations + +DataFusion's DataFrame API offers a wide range of operations: + +```python +from datafusion import column, literal + +# Select specific columns +df = df.select("col1", "col2") + +# Select with expressions +df = df.select(column("a") + column("b"), column("a") - column("b")) + +# Filter rows (expressions or SQL strings) +df = df.filter(column("age") > literal(25)) +df = df.filter("age > 25") + +# Add computed columns +df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) + +# Multiple column additions +df = df.with_columns( + (column("a") + column("b")).alias("sum"), + (column("a") * column("b")).alias("product") +) + +# Sort data +df = df.sort(column("age").sort(ascending=False)) + +# Join DataFrames +df = df1.join(df2, on="user_id", how="inner") + +# Aggregate data +from datafusion import functions as f +df = df.aggregate( + [], # Group by columns (empty for global aggregation) + [f.sum(column("amount")).alias("total_amount")] +) + +# Limit rows +df = df.limit(100) + +# Drop columns +df = df.drop("temporary_column") +``` + +## Column Names as Function Arguments + +Some [`DataFrame`][datafusion.dataframe.DataFrame] methods accept column names when an argument refers to an +existing column. These include: + +- [`select`][datafusion.dataframe.DataFrame.select] +- [`sort`][datafusion.dataframe.DataFrame.sort] +- [`drop`][datafusion.dataframe.DataFrame.drop] +- [`join`][datafusion.dataframe.DataFrame.join] (`on` argument) +- [`aggregate`][datafusion.dataframe.DataFrame.aggregate] (grouping columns) + +See the full function documentation for details on any specific function. + +Note that [`join_on`][datafusion.dataframe.DataFrame.join_on] expects [`col()`][datafusion.col.col]/[`column()`][datafusion.col.column] expressions rather than plain strings. 
+ +For such methods, you can pass column names directly: + +```python +from datafusion import col, functions as f + +df.sort('id') +df.aggregate('id', [f.count(col('value'))]) +``` + +The same operation can also be written with explicit column expressions, using either [`col()`][datafusion.col.col] or [`column()`][datafusion.col.column]: + +```python +from datafusion import col, column, functions as f + +df.sort(col('id')) +df.aggregate(column('id'), [f.count(col('value'))]) +``` + +Note that [`column()`][datafusion.col.column] is an alias of [`col()`][datafusion.col.col], so you can use either name; the example above shows both in action. + +Whenever an argument represents an expression—such as in +[`filter`][datafusion.dataframe.DataFrame.filter] or +[`with_column`][datafusion.dataframe.DataFrame.with_column]—use [`col()`][datafusion.col.col] to reference +columns. The comparison and arithmetic operators on [`Expr`][datafusion.expr.Expr] will automatically +convert any non-[`Expr`][datafusion.expr.Expr] value into a literal expression, so writing + +```python +from datafusion import col +df.filter(col("age") > 21) +``` + +is equivalent to using `lit(21)` explicitly. Use [`lit()`][datafusion.lit] (also available +as [`literal()`][datafusion.literal]) when you need to construct a literal expression directly. 
+ +## Terminal Operations + +To materialize the results of your DataFrame operations: + +```python +# Collect all data as PyArrow RecordBatches +result_batches = df.collect() + +# Convert to various formats +pandas_df = df.to_pandas() # Pandas DataFrame +polars_df = df.to_polars() # Polars DataFrame +arrow_table = df.to_arrow_table() # PyArrow Table +py_dict = df.to_pydict() # Python dictionary +py_list = df.to_pylist() # Python list of dictionaries + +# Display results +df.show() # Print tabular format to console + +# Count rows +count = df.count() + +# Collect a single column of data as a PyArrow Array +arr = df.collect_column("age") +``` + +## Zero-copy streaming to Arrow-based Python libraries + +DataFusion DataFrames implement the `__arrow_c_stream__` protocol, enabling +zero-copy, lazy streaming into Arrow-based Python libraries. With the streaming +protocol, batches are produced on demand. + +!!! note + + The protocol is implementation-agnostic and works with any Python library + that understands the Arrow C streaming interface (for example, PyArrow + or other Arrow-compatible implementations). The sections below provide a + short PyArrow-specific example and general guidance for other + implementations. + +## PyArrow + +```python +import pyarrow as pa + +# Create a PyArrow RecordBatchReader without materializing all batches +reader = pa.RecordBatchReader.from_stream(df) +for batch in reader: + ... # process each batch as it is produced +``` + +DataFrames are also iterable, yielding [`RecordBatch`][datafusion.RecordBatch] +objects lazily so you can loop over results directly without importing +PyArrow: + +```python +for batch in df: + ... # each batch is a ``datafusion.RecordBatch`` +``` + +Each batch exposes [`to_pyarrow()`][datafusion.record_batch.RecordBatch.to_pyarrow], allowing conversion to a PyArrow +table. 
`pa.table(df)` collects the entire DataFrame eagerly into a +PyArrow table: + +```python +import pyarrow as pa +table = pa.table(df) +``` + +Asynchronous iteration is supported as well, allowing integration with +`asyncio` event loops: + +```python +async for batch in df: + ... # process each batch as it is produced +``` + +To work with the stream directly, use [`execute_stream()`][datafusion.dataframe.DataFrame.execute_stream], which returns a +[`RecordBatchStream`][datafusion.RecordBatchStream]. + +```python +stream = df.execute_stream() +for batch in stream: + ... +``` + +### Execute as Stream + +For finer control over streaming execution, use +[`execute_stream`][datafusion.dataframe.DataFrame.execute_stream] to obtain a +[`RecordBatchStream`][datafusion.record_batch.RecordBatchStream]: + +```python +stream = df.execute_stream() +for batch in stream: + ... # process each batch as it is produced +``` + +!!! tip + + To get a PyArrow reader instead, call + + `pa.RecordBatchReader.from_stream(df)`. + +When partition boundaries are important, +[`execute_stream_partitioned`][datafusion.dataframe.DataFrame.execute_stream_partitioned] +returns an iterable of [`RecordBatchStream`][datafusion.record_batch.RecordBatchStream] objects, one per +partition: + +```python +for stream in df.execute_stream_partitioned(): + for batch in stream: + ... # each stream yields RecordBatches +``` + +To process partitions concurrently, first collect the streams into a list +and then poll each one in a separate `asyncio` task: + +```python +import asyncio + +async def consume(stream): + async for batch in stream: + ... + +streams = list(df.execute_stream_partitioned()) +await asyncio.gather(*(consume(s) for s in streams)) +``` + +See the [Arrow I/O guide](../io/arrow.md) for additional details on the Arrow interface. + +## HTML Rendering + +When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will +automatically display as formatted HTML tables.
For detailed information about customizing HTML +rendering, formatting options, and advanced styling, see [DataFrame Rendering](rendering.md). + +## Core Classes + +**DataFrame** + +: The main DataFrame class for building and executing queries. + + See: [`DataFrame`][datafusion.dataframe.DataFrame] + +**SessionContext** + +: The primary entry point for creating DataFrames from various data sources. + + Key methods for DataFrame creation: + + - [`read_csv`][datafusion.context.SessionContext.read_csv] - Read CSV files + - [`read_parquet`][datafusion.context.SessionContext.read_parquet] - Read Parquet files + - [`read_json`][datafusion.context.SessionContext.read_json] - Read JSON files + - [`read_avro`][datafusion.context.SessionContext.read_avro] - Read Avro files + - [`table`][datafusion.context.SessionContext.table] - Access registered tables + - [`sql`][datafusion.context.SessionContext.sql] - Execute SQL queries + - [`from_pandas`][datafusion.context.SessionContext.from_pandas] - Create from Pandas DataFrame + - [`from_arrow`][datafusion.context.SessionContext.from_arrow] - Create from Arrow data + + See: [`SessionContext`][datafusion.context.SessionContext] + +## Expression Classes + +**Expr** + +: Represents expressions that can be used in DataFrame operations. + + See: [`Expr`][datafusion.expr.Expr] + +**Functions for creating expressions:** + +- [`column`][datafusion.col.column] - Reference a column by name +- [`literal`][datafusion.literal] - Create a literal value expression + +## Built-in Functions + +DataFusion provides many built-in functions for data manipulation: + +- [`functions`][datafusion.functions] - Mathematical, string, date/time, and aggregation functions + +For a complete list of available functions, see the [`functions`][datafusion.functions] module documentation.
+ +## Execution Metrics + +After executing a DataFrame (via [`collect()`][datafusion.dataframe.DataFrame.collect], [`execute_stream()`][datafusion.dataframe.DataFrame.execute_stream], etc.), +DataFusion populates per-operator runtime statistics such as row counts and +compute time. See [Execution Metrics](execution-metrics.md) for a full explanation and +worked example. + +## Further reading + +- [Rendering](rendering.md) — Jupyter HTML repr customization. +- [Execution Metrics](execution-metrics.md) — per-operator row counts, + compute time, spill events. diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst deleted file mode 100644 index 8475a7bd7..000000000 --- a/docs/source/user-guide/dataframe/index.rst +++ /dev/null @@ -1,380 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -DataFrames -========== - -Overview --------- - -The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations -on that data. DataFrames provide a flexible API for transforming data through various operations such as -filtering, projection, aggregation, joining, and more. - -A DataFrame represents a logical plan that is lazily evaluated.
The actual execution occurs only when -terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. - -Creating DataFrames -------------------- - -DataFrames can be created in several ways: - -* From SQL queries via a ``SessionContext``: - - .. code-block:: python - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.sql("SELECT * FROM your_table") - -* From registered tables: - - .. code-block:: python - - df = ctx.table("your_table") - -* From various data sources: - - .. code-block:: python - - # From CSV files (see :ref:`io_csv` for detailed options) - df = ctx.read_csv("path/to/data.csv") - - # From Parquet files (see :ref:`io_parquet` for detailed options) - df = ctx.read_parquet("path/to/data.parquet") - - # From JSON files (see :ref:`io_json` for detailed options) - df = ctx.read_json("path/to/data.json") - - # From Avro files (see :ref:`io_avro` for detailed options) - df = ctx.read_avro("path/to/data.avro") - - # From Pandas DataFrame - import pandas as pd - pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = ctx.from_pandas(pandas_df) - - # From Arrow data - import pyarrow as pa - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"] - ) - df = ctx.from_arrow(batch) - -For detailed information about reading from different data sources, see the :doc:`I/O Guide <../io/index>`. -For custom data sources, see :ref:`io_custom_table_provider`. - -Common DataFrame Operations ---------------------------- - -DataFusion's DataFrame API offers a wide range of operations: - -.. 
code-block:: python - - from datafusion import column, literal - - # Select specific columns - df = df.select("col1", "col2") - - # Select with expressions - df = df.select(column("a") + column("b"), column("a") - column("b")) - - # Filter rows (expressions or SQL strings) - df = df.filter(column("age") > literal(25)) - df = df.filter("age > 25") - - # Add computed columns - df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) - - # Multiple column additions - df = df.with_columns( - (column("a") + column("b")).alias("sum"), - (column("a") * column("b")).alias("product") - ) - - # Sort data - df = df.sort(column("age").sort(ascending=False)) - - # Join DataFrames - df = df1.join(df2, on="user_id", how="inner") - - # Aggregate data - from datafusion import functions as f - df = df.aggregate( - [], # Group by columns (empty for global aggregation) - [f.sum(column("amount")).alias("total_amount")] - ) - - # Limit rows - df = df.limit(100) - - # Drop columns - df = df.drop("temporary_column") - -Column Names as Function Arguments ----------------------------------- - -Some ``DataFrame`` methods accept column names when an argument refers to an -existing column. These include: - -* :py:meth:`~datafusion.DataFrame.select` -* :py:meth:`~datafusion.DataFrame.sort` -* :py:meth:`~datafusion.DataFrame.drop` -* :py:meth:`~datafusion.DataFrame.join` (``on`` argument) -* :py:meth:`~datafusion.DataFrame.aggregate` (grouping columns) - -See the full function documentation for details on any specific function. - -Note that :py:meth:`~datafusion.DataFrame.join_on` expects ``col()``/``column()`` expressions rather than plain strings. - -For such methods, you can pass column names directly: - -.. code-block:: python - - from datafusion import col, functions as f - - df.sort('id') - df.aggregate('id', [f.count(col('value'))]) - -The same operation can also be written with explicit column expressions, using either ``col()`` or ``column()``: - -.. 
code-block:: python - - from datafusion import col, column, functions as f - - df.sort(col('id')) - df.aggregate(column('id'), [f.count(col('value'))]) - -Note that ``column()`` is an alias of ``col()``, so you can use either name; the example above shows both in action. - -Whenever an argument represents an expression—such as in -:py:meth:`~datafusion.DataFrame.filter` or -:py:meth:`~datafusion.DataFrame.with_column`—use ``col()`` to reference -columns. The comparison and arithmetic operators on ``Expr`` will automatically -convert any non-``Expr`` value into a literal expression, so writing - -.. code-block:: python - - from datafusion import col - df.filter(col("age") > 21) - -is equivalent to using ``lit(21)`` explicitly. Use ``lit()`` (also available -as ``literal()``) when you need to construct a literal expression directly. - -Terminal Operations -------------------- - -To materialize the results of your DataFrame operations: - -.. code-block:: python - - # Collect all data as PyArrow RecordBatches - result_batches = df.collect() - - # Convert to various formats - pandas_df = df.to_pandas() # Pandas DataFrame - polars_df = df.to_polars() # Polars DataFrame - arrow_table = df.to_arrow_table() # PyArrow Table - py_dict = df.to_pydict() # Python dictionary - py_list = df.to_pylist() # Python list of dictionaries - - # Display results - df.show() # Print tabular format to console - - # Count rows - count = df.count() - - # Collect a single column of data as a PyArrow Array - arr = df.collect_column("age") - -Zero-copy streaming to Arrow-based Python libraries ---------------------------------------------------- - -DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling -zero-copy, lazy streaming into Arrow-based Python libraries. With the streaming -protocol, batches are produced on demand. - -.. 
note:: - - The protocol is implementation-agnostic and works with any Python library - that understands the Arrow C streaming interface (for example, PyArrow - or other Arrow-compatible implementations). The sections below provide a - short PyArrow-specific example and general guidance for other - implementations. - -PyArrow -------- - -.. code-block:: python - - import pyarrow as pa - - # Create a PyArrow RecordBatchReader without materializing all batches - reader = pa.RecordBatchReader.from_stream(df) - for batch in reader: - ... # process each batch as it is produced - -DataFrames are also iterable, yielding :class:`datafusion.RecordBatch` -objects lazily so you can loop over results directly without importing -PyArrow: - -.. code-block:: python - - for batch in df: - ... # each batch is a ``datafusion.RecordBatch`` - -Each batch exposes ``to_pyarrow()``, allowing conversion to a PyArrow -table. ``pa.table(df)`` collects the entire DataFrame eagerly into a -PyArrow table: - -.. code-block:: python - - import pyarrow as pa - table = pa.table(df) - -Asynchronous iteration is supported as well, allowing integration with -``asyncio`` event loops: - -.. code-block:: python - - async for batch in df: - ... # process each batch as it is produced - -To work with the stream directly, use ``execute_stream()``, which returns a -:class:`~datafusion.RecordBatchStream`. - -.. code-block:: python - - stream = df.execute_stream() - for batch in stream: - ... - -Execute as Stream -^^^^^^^^^^^^^^^^^ - -For finer control over streaming execution, use -:py:meth:`~datafusion.DataFrame.execute_stream` to obtain a -:py:class:`datafusion.RecordBatchStream`: - -.. code-block:: python - - stream = df.execute_stream() - for batch in stream: - ... # process each batch as it is produced - -.. tip:: - - To get a PyArrow reader instead, call - - ``pa.RecordBatchReader.from_stream(df)``. 
- -When partition boundaries are important, -:py:meth:`~datafusion.DataFrame.execute_stream_partitioned` -returns an iterable of :py:class:`datafusion.RecordBatchStream` objects, one per -partition: - -.. code-block:: python - - for stream in df.execute_stream_partitioned(): - for batch in stream: - ... # each stream yields RecordBatches - -To process partitions concurrently, first collect the streams into a list -and then poll each one in a separate ``asyncio`` task: - -.. code-block:: python - - import asyncio - - async def consume(stream): - async for batch in stream: - ... - - streams = list(df.execute_stream_partitioned()) - await asyncio.gather(*(consume(s) for s in streams)) - -See :doc:`../io/arrow` for additional details on the Arrow interface. - -HTML Rendering --------------- - -When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will -automatically display as formatted HTML tables. For detailed information about customizing HTML -rendering, formatting options, and advanced styling, see :doc:`rendering`. - -Core Classes ------------- - -**DataFrame** - The main DataFrame class for building and executing queries. - - See: :py:class:`datafusion.DataFrame` - -**SessionContext** - The primary entry point for creating DataFrames from various data sources. 
- - Key methods for DataFrame creation: - - * :py:meth:`~datafusion.SessionContext.read_csv` - Read CSV files - * :py:meth:`~datafusion.SessionContext.read_parquet` - Read Parquet files - * :py:meth:`~datafusion.SessionContext.read_json` - Read JSON files - * :py:meth:`~datafusion.SessionContext.read_avro` - Read Avro files - * :py:meth:`~datafusion.SessionContext.table` - Access registered tables - * :py:meth:`~datafusion.SessionContext.sql` - Execute SQL queries - * :py:meth:`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame - * :py:meth:`~datafusion.SessionContext.from_arrow` - Create from Arrow data - - See: :py:class:`datafusion.SessionContext` - -Expression Classes ------------------- - -**Expr** - Represents expressions that can be used in DataFrame operations. - - See: :py:class:`datafusion.Expr` - -**Functions for creating expressions:** - -* :py:func:`datafusion.column` - Reference a column by name -* :py:func:`datafusion.literal` - Create a literal value expression - -Built-in Functions ------------------- - -DataFusion provides many built-in functions for data manipulation: - -* :py:mod:`datafusion.functions` - Mathematical, string, date/time, and aggregation functions - -For a complete list of available functions, see the :py:mod:`datafusion.functions` module documentation. - - -Execution Metrics ------------------ - -After executing a DataFrame (via ``collect()``, ``execute_stream()``, etc.), -DataFusion populates per-operator runtime statistics such as row counts and -compute time. See :doc:`execution-metrics` for a full explanation and -worked example. - -.. 
toctree:: - :maxdepth: 1 - - rendering - execution-metrics diff --git a/docs/source/user-guide/dataframe/rendering.md b/docs/source/user-guide/dataframe/rendering.md new file mode 100644 index 000000000..0b668b985 --- /dev/null +++ b/docs/source/user-guide/dataframe/rendering.md @@ -0,0 +1,217 @@ + + +# DataFrame Rendering + +DataFusion provides configurable rendering for DataFrames in both plain text and HTML +formats. The [`datafusion.dataframe_formatter`](../../reference/datafusion/dataframe_formatter.md) module controls how DataFrames are +displayed in Jupyter notebooks (via `_repr_html_`), in the terminal (via `__repr__`), +and anywhere else a string or HTML representation is needed. + + +## Basic Rendering + +In a Jupyter environment, displaying a DataFrame triggers HTML rendering: + +```python +# Will display as HTML table in Jupyter +df + +# Explicit display also uses HTML rendering +display(df) +``` + +In a terminal or when converting to string, plain text rendering is used: + +```python +# Plain text table output +print(df) +``` + +## Configuring the Formatter + +You can customize how DataFrames are rendered by configuring the global formatter: + +```python +from datafusion.dataframe_formatter import configure_formatter + +configure_formatter( + max_cell_length=25, # Maximum characters in a cell before truncation + max_width=1000, # Maximum width in pixels (HTML only) + max_height=300, # Maximum height in pixels (HTML only) + max_memory_bytes=2097152, # Maximum memory for rendering (2MB) + min_rows=10, # Minimum number of rows to display + max_rows=10, # Maximum rows to display + enable_cell_expansion=True, # Allow expanding truncated cells (HTML only) + custom_css=None, # Additional custom CSS (HTML only) + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom styling provider (HTML only) + use_shared_styles=True, # Share styles across tables (HTML only) +) +``` + +The formatter settings affect all DataFrames 
displayed after configuration. + +## Custom Style Providers + +For HTML styling, you can create a custom style provider that implements the +[`StyleProvider`][datafusion.dataframe_formatter.StyleProvider] protocol: + +```python +from datafusion.dataframe_formatter import configure_formatter + +class MyStyleProvider: + def get_cell_style(self): + """Return CSS style string for table data cells.""" + return "border: 1px solid #ddd; padding: 8px; text-align: left;" + + def get_header_style(self): + """Return CSS style string for table header cells.""" + return ( + "background-color: #007bff; color: white; " + "padding: 8px; text-align: left;" + ) + +# Apply the custom style provider +configure_formatter(style_provider=MyStyleProvider()) +``` + +## Custom Cell Formatters + +You can register custom formatters for specific Python types. A cell formatter is any +callable that takes a value and returns a string: + +```python +from datafusion.dataframe_formatter import get_formatter + +formatter = get_formatter() + +# Format floats to 2 decimal places +formatter.register_formatter(float, lambda v: f"{v:.2f}") + +# Format dates in a custom way +from datetime import date +formatter.register_formatter(date, lambda v: v.strftime("%B %d, %Y")) +``` + +## Custom Cell and Header Builders + +For full control over the HTML of individual cells or headers, you can set custom +builder functions: + +```python +from datafusion.dataframe_formatter import get_formatter + +formatter = get_formatter() + +# Custom cell builder receives (value, row, col, table_id) and returns HTML +def my_cell_builder(value, row, col, table_id): + color = "red" if isinstance(value, (int, float)) and value < 0 else "black" + return f"<td style='color: {color}'>{value}</td>" + +formatter.set_custom_cell_builder(my_cell_builder) + +# Custom header builder receives a schema field and returns HTML +def my_header_builder(field): + return f"<th style='font-weight: bold'>{field.name}</th>" + +formatter.set_custom_header_builder(my_header_builder) +``` + +## Performance Optimization
with Shared Styles + +The `use_shared_styles` parameter (enabled by default) optimizes performance when +displaying multiple DataFrames in notebook environments: + +```python +from datafusion.dataframe_formatter import configure_formatter + +# Default: Use shared styles (recommended for notebooks) +configure_formatter(use_shared_styles=True) + +# Disable shared styles (each DataFrame includes its own styles) +configure_formatter(use_shared_styles=False) +``` + +When `use_shared_styles=True`: + +- CSS styles and JavaScript are included only once per notebook session +- This reduces HTML output size and prevents style duplication +- Improves rendering performance with many DataFrames +- Applies consistent styling across all DataFrames + +## Working with the Formatter Directly + +You can use [`get_formatter()`][datafusion.dataframe_formatter.get_formatter] and [`set_formatter()`][datafusion.dataframe_formatter.set_formatter] for direct access to the global +formatter instance: + +```python +from datafusion.dataframe_formatter import ( + DataFrameHtmlFormatter, + get_formatter, + set_formatter, +) + +# Get and modify the current formatter +formatter = get_formatter() +print(formatter.max_rows) +print(formatter.max_cell_length) + +# Create and set a fully custom formatter +custom_formatter = DataFrameHtmlFormatter( + max_cell_length=50, + max_rows=20, + enable_cell_expansion=False, +) +set_formatter(custom_formatter) +``` + +Reset to default formatting: + +```python +from datafusion.dataframe_formatter import reset_formatter + +# Reset to default settings +reset_formatter() +``` + +## Memory and Display Controls + +You can control how much data is displayed and how much memory is used for rendering: + +```python +from datafusion.dataframe_formatter import configure_formatter + +configure_formatter( + max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display + min_rows=20, # Always show at least 20 rows + max_rows=50, # Show up to 50 rows in output +) +``` + 
+These parameters help balance comprehensive data display against performance considerations. + +## Best Practices + +1. **Global Configuration**: Use [`configure_formatter()`][datafusion.dataframe_formatter.configure_formatter] at the beginning of your notebook to set up consistent formatting for all DataFrames. +2. **Memory Management**: Set appropriate `max_memory_bytes` limits to prevent performance issues with large datasets. +3. **Shared Styles**: Keep `use_shared_styles=True` (default) for better performance in notebooks with multiple DataFrames. +4. **Reset When Needed**: Call [`reset_formatter()`][datafusion.dataframe_formatter.reset_formatter] when you want to start fresh with default settings. +5. **Cell Expansion**: Use `enable_cell_expansion=True` when cells might contain longer content that users may want to see in full. + diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst deleted file mode 100644 index dc61a422f..000000000 --- a/docs/source/user-guide/dataframe/rendering.rst +++ /dev/null @@ -1,240 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. 
- -DataFrame Rendering -=================== - -DataFusion provides configurable rendering for DataFrames in both plain text and HTML -formats. The ``datafusion.dataframe_formatter`` module controls how DataFrames are -displayed in Jupyter notebooks (via ``_repr_html_``), in the terminal (via ``__repr__``), -and anywhere else a string or HTML representation is needed. - -Basic Rendering ---------------- - -In a Jupyter environment, displaying a DataFrame triggers HTML rendering: - -.. code-block:: python - - # Will display as HTML table in Jupyter - df - - # Explicit display also uses HTML rendering - display(df) - -In a terminal or when converting to string, plain text rendering is used: - -.. code-block:: python - - # Plain text table output - print(df) - -Configuring the Formatter -------------------------- - -You can customize how DataFrames are rendered by configuring the global formatter: - -.. code-block:: python - - from datafusion.dataframe_formatter import configure_formatter - - configure_formatter( - max_cell_length=25, # Maximum characters in a cell before truncation - max_width=1000, # Maximum width in pixels (HTML only) - max_height=300, # Maximum height in pixels (HTML only) - max_memory_bytes=2097152, # Maximum memory for rendering (2MB) - min_rows=10, # Minimum number of rows to display - max_rows=10, # Maximum rows to display - enable_cell_expansion=True, # Allow expanding truncated cells (HTML only) - custom_css=None, # Additional custom CSS (HTML only) - show_truncation_message=True, # Show message when data is truncated - style_provider=None, # Custom styling provider (HTML only) - use_shared_styles=True, # Share styles across tables (HTML only) - ) - -The formatter settings affect all DataFrames displayed after configuration. - -Custom Style Providers ----------------------- - -For HTML styling, you can create a custom style provider that implements the -``StyleProvider`` protocol: - -.. 
code-block:: python - - from datafusion.dataframe_formatter import configure_formatter - - class MyStyleProvider: - def get_cell_style(self): - """Return CSS style string for table data cells.""" - return "border: 1px solid #ddd; padding: 8px; text-align: left;" - - def get_header_style(self): - """Return CSS style string for table header cells.""" - return ( - "background-color: #007bff; color: white; " - "padding: 8px; text-align: left;" - ) - - # Apply the custom style provider - configure_formatter(style_provider=MyStyleProvider()) - -Custom Cell Formatters ----------------------- - -You can register custom formatters for specific Python types. A cell formatter is any -callable that takes a value and returns a string: - -.. code-block:: python - - from datafusion.dataframe_formatter import get_formatter - - formatter = get_formatter() - - # Format floats to 2 decimal places - formatter.register_formatter(float, lambda v: f"{v:.2f}") - - # Format dates in a custom way - from datetime import date - formatter.register_formatter(date, lambda v: v.strftime("%B %d, %Y")) - -Custom Cell and Header Builders -------------------------------- - -For full control over the HTML of individual cells or headers, you can set custom -builder functions: - -.. 
code-block:: python - - from datafusion.dataframe_formatter import get_formatter - - formatter = get_formatter() - - # Custom cell builder receives (value, row, col, table_id) and returns HTML - def my_cell_builder(value, row, col, table_id): - color = "red" if isinstance(value, (int, float)) and value < 0 else "black" - return f"{value}" - - formatter.set_custom_cell_builder(my_cell_builder) - - # Custom header builder receives a schema field and returns HTML - def my_header_builder(field): - return f"{field.name}" - - formatter.set_custom_header_builder(my_header_builder) - -Performance Optimization with Shared Styles --------------------------------------------- - -The ``use_shared_styles`` parameter (enabled by default) optimizes performance when -displaying multiple DataFrames in notebook environments: - -.. code-block:: python - - from datafusion.dataframe_formatter import configure_formatter - - # Default: Use shared styles (recommended for notebooks) - configure_formatter(use_shared_styles=True) - - # Disable shared styles (each DataFrame includes its own styles) - configure_formatter(use_shared_styles=False) - -When ``use_shared_styles=True``: - -- CSS styles and JavaScript are included only once per notebook session -- This reduces HTML output size and prevents style duplication -- Improves rendering performance with many DataFrames -- Applies consistent styling across all DataFrames - -Working with the Formatter Directly ------------------------------------- - -You can use ``get_formatter()`` and ``set_formatter()`` for direct access to the global -formatter instance: - -.. 
code-block:: python - - from datafusion.dataframe_formatter import ( - DataFrameHtmlFormatter, - get_formatter, - set_formatter, - ) - - # Get and modify the current formatter - formatter = get_formatter() - print(formatter.max_rows) - print(formatter.max_cell_length) - - # Create and set a fully custom formatter - custom_formatter = DataFrameHtmlFormatter( - max_cell_length=50, - max_rows=20, - enable_cell_expansion=False, - ) - set_formatter(custom_formatter) - -Reset to default formatting: - -.. code-block:: python - - from datafusion.dataframe_formatter import reset_formatter - - # Reset to default settings - reset_formatter() - -Memory and Display Controls ---------------------------- - -You can control how much data is displayed and how much memory is used for rendering: - -.. code-block:: python - - from datafusion.dataframe_formatter import configure_formatter - - configure_formatter( - max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display - min_rows=20, # Always show at least 20 rows - max_rows=50, # Show up to 50 rows in output - ) - -These parameters help balance comprehensive data display against performance considerations. - -Best Practices --------------- - -1. **Global Configuration**: Use ``configure_formatter()`` at the beginning of your notebook to set up consistent formatting for all DataFrames. - -2. **Memory Management**: Set appropriate ``max_memory_bytes`` limits to prevent performance issues with large datasets. - -3. **Shared Styles**: Keep ``use_shared_styles=True`` (default) for better performance in notebooks with multiple DataFrames. - -4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. - -5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. 
- -Additional Resources --------------------- - -* :doc:`../dataframe/index` - Complete guide to using DataFrames -* :doc:`../io/index` - I/O Guide for reading data from various sources -* :doc:`../data-sources` - Comprehensive data sources guide -* :ref:`io_csv` - CSV file reading -* :ref:`io_parquet` - Parquet file reading -* :ref:`io_json` - JSON file reading -* :ref:`io_avro` - Avro file reading -* :ref:`io_custom_table_provider` - Custom table providers -* `API Reference `_ - Full API reference diff --git a/docs/source/user-guide/distributing-work.md b/docs/source/user-guide/distributing-work.md new file mode 100644 index 000000000..ea9d56e56 --- /dev/null +++ b/docs/source/user-guide/distributing-work.md @@ -0,0 +1,325 @@ + + +# Distributing work + +DataFusion supports splitting work across processes by shipping +serialized expressions to workers: the driver builds an +[`Expr`][datafusion.expr.Expr], each worker evaluates it against its +own slice of data. This pattern suits embarrassingly-parallel +workloads where the driver decides partitioning up front. + +Query-level distribution — where the runtime partitions a single +logical or physical plan across worker nodes — is in progress +upstream via [datafusion-distributed](https://github.com/datafusion-contrib/datafusion-distributed) and [Apache +Ballista](https://datafusion.apache.org/ballista/). Both +have short sections at the end of this page; integration details +will land as those projects become usable from datafusion-python. + +## Expression-level distribution + +DataFusion expressions support distribution directly: pass one to a +worker process and Python's standard +[pickle](https://docs.python.org/3/library/pickle.html) machinery +serializes it transparently — the same machinery +[`Pool.map`][multiprocessing.pool.Pool.map], Ray's `@ray.remote`, and +similar libraries already use to ship function arguments. 
Python UDFs +— scalar, aggregate, and window — travel inside the serialized +expression; the receiver does not need to pre-register them. + +### Basic worker-pool example + +Define a worker function that takes the expression plus a batch and +returns the evaluated result: + +```python +import pyarrow as pa +from datafusion import SessionContext + + +def evaluate(expr, batch): + # `expr` arrived here via the pool's automatic pickling — + # no manual serialization needed in user code. + ctx = SessionContext() + df = ctx.from_pydict({"a": batch}) + return df.with_column("result", expr).select("result").to_pydict()["result"] +``` + +Then build the expression in the driver and fan it out: + +```python +import multiprocessing as mp +from datafusion import col, udf + +double = udf( + lambda arr: pa.array([(v.as_py() or 0) * 2 for v in arr]), + [pa.int64()], pa.int64(), volatility="immutable", name="double", +) +expr = double(col("a")) + +mp_ctx = mp.get_context("forkserver") +with mp_ctx.Pool(processes=4) as pool: + results = pool.starmap( + evaluate, + [(expr, [1, 2, 3]), (expr, [10, 20, 30])], + ) +print(results) # [[2, 4, 6], [20, 40, 60]] +``` + +!!! note + + When saved to a `.py` file and executed with the `spawn` or + `forkserver` start method, wrap the driver block in + `if __name__ == "__main__":` so worker processes can re-import + the module without re-running it. This is a standard Python + [`multiprocessing`][multiprocessing] requirement, not DataFusion-specific — + see [Safe importing of main module](https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods) + in the Python docs. + +### What travels with the expression + +- **Built-in functions** (`abs`, `length`, arithmetic, comparisons, + etc.) — fully portable. Worker needs nothing pre-registered. + +- **Python UDFs** — travel inline (subject to the two portability + requirements below). 
The callable, its signature, and any state + captured in closures travel inside the serialized expression and are + reconstructed on the worker automatically. Applies equally to: + + - **scalar UDFs** ([`udf`][datafusion.user_defined.udf]) + - **aggregate UDFs** ([`udaf`][datafusion.user_defined.udaf]) + - **window UDFs** ([`udwf`][datafusion.user_defined.udwf]) + +- **UDFs imported via the FFI capsule protocol** — travel **by name + only**. The worker must already have a matching registration on its + [`SessionContext`][datafusion.context.SessionContext]. Without that registration, evaluation + raises an error. + +### Portability requirements for inline Python UDFs + +Inline Python UDFs ride on [cloudpickle](https://github.com/cloudpipe/cloudpickle), which imposes two +requirements on the worker environment: + +- **Matching Python minor version.** cloudpickle serializes Python + bytecode, which is not stable across minor versions. A UDF pickled + on 3.12 cannot be reconstructed on 3.11 or 3.13. The wire format + stamps the sender's `(major, minor)`; mismatches raise a clear + error naming both versions. Align the Python version on driver and + workers. +- **Imported modules must be importable on the worker.** cloudpickle + captures the callable *by value* (bytecode and closure cells travel + whole), but names resolved through `import` are captured *by + reference* — module path only. A UDF doing + `from mylib import transform` requires `mylib` installed on the + worker. Same applies to bound methods of imported classes. + Self-contained UDFs (no imports beyond what the worker already has, + e.g. `pyarrow`) avoid this entirely. 
+ +### Registering shared UDFs on workers + +When an expression references an FFI capsule UDF (or any UDF the +worker must resolve from its registered functions), set up the +worker's [`SessionContext`][datafusion.context.SessionContext] once per process and install it +as the *worker context*: + +```python +from datafusion import SessionContext +from datafusion.ipc import set_worker_ctx + + +def init_worker(): + ctx = SessionContext() + ctx.register_udaf(my_ffi_aggregate) + set_worker_ctx(ctx) + + +with mp.get_context("forkserver").Pool( + processes=4, initializer=init_worker +) as pool: + ... +``` + +Inside a worker, expressions arriving from the driver resolve their +by-name references against the installed worker context. If no worker +context is installed, the global [`SessionContext`][datafusion.context.SessionContext] is used — +fine for expressions that only reference built-ins and Python UDFs, +but FFI-capsule-backed registrations must be installed on the global +context to resolve. + +### Python 3.14 default change + +Python 3.14 changed the Linux default start method for +[`multiprocessing`][multiprocessing] from `fork` to `forkserver` (macOS has +defaulted to `spawn` since Python 3.8; Windows has always used +`spawn`). With `fork`, any state set in the parent was visible in +workers via copy-on-write; with `forkserver` and `spawn` it is +not. The [`set_worker_ctx`][datafusion.ipc.set_worker_ctx] pattern works on +every start method — prefer it over relying on inherited state. + +### Practical considerations + +- **Serialized size scales with what travels inline.** A serialized + expression of just built-ins is small (tens of bytes). An + expression carrying a Python UDF is hundreds of bytes (the callable + and its signature). When the same UDF is shipped many times, + registering an equivalent FFI-capsule UDF on each worker via + [`set_worker_ctx`][datafusion.ipc.set_worker_ctx] and referring to it by + name cuts the per-trip overhead. 
+- **Closure capture.** When a Python UDF closes over surrounding + state — local variables, module-level objects, file paths — that + state is captured at serialization time. Surprises are possible if + the captured state is large, mutable, or not portable to the + worker's environment. See [Portability requirements for inline + Python UDFs](#portability-requirements-for-inline-python-udfs) for the Python-version and imported-module rules. + +### Disabling Python UDF inlining + +For a stricter wire format, call +[`SessionContext.with_python_udf_inlining(enabled=False)`][datafusion.context.SessionContext.with_python_udf_inlining] on the session +producing or consuming the bytes. With inlining disabled, Python +UDFs travel by name only — the same way FFI-capsule UDFs do — and +the receiver must have a matching registration. + +Two use cases: + +- **Cross-language portability.** A non-Python decoder cannot + reconstruct a cloudpickled payload. Senders aimed at Java, C++, + or another Rust binary disable inlining and rely on the receiver + having compatible UDF registrations. +- **Untrusted-source decode.** With inlining disabled, + [`from_bytes`][datafusion.expr.Expr.from_bytes] never calls `cloudpickle.loads` on + the incoming bytes — an inline payload from a misbehaving sender + raises a clear error instead of executing arbitrary Python code. + +Mismatched configurations raise a descriptive error: an inline blob +fed to a strict receiver fails fast rather than silently dropping +into `cloudpickle.loads`. 
+ +To make the toggle apply through [`pickle.dumps`][pickle.dumps] (which +calls [`to_bytes`][datafusion.expr.Expr.to_bytes] with no context), install the strict +session as the driver's *sender context*: + +```python +from datafusion import SessionContext +from datafusion.ipc import set_sender_ctx + +set_sender_ctx(SessionContext().with_python_udf_inlining(enabled=False)) +# Every subsequent pickle.dumps(expr) on this thread encodes +# without inlining the Python callable. +``` + +Pair with a matching strict worker context +([`set_worker_ctx`][datafusion.ipc.set_worker_ctx]) so the `pickle.loads` +side also refuses inline payloads. Explicit +[`Expr.to_bytes(ctx)`][datafusion.expr.Expr.to_bytes] and +[`Expr.from_bytes(blob, ctx=ctx)`][datafusion.expr.Expr.from_bytes] calls +honor the supplied `ctx` directly and ignore the sender / worker +contexts. + +The toggle only narrows the [`from_bytes`][datafusion.expr.Expr.from_bytes] surface; +[`pickle.loads`][pickle.loads] on untrusted bytes remains unsafe regardless +of this setting. See the [Security](#security) section below for the full +threat model. + +### Security + +!!! warning + + Reconstructing an expression containing a Python UDF executes + arbitrary Python code on the receiver — pickle is doing the work + under the hood and pickle is unsafe on untrusted input (see the + [pickle module security warning](https://docs.python.org/3/library/pickle.html#module-pickle) + in the Python standard library docs). Only accept expressions + from trusted sources. For untrusted-source workflows, disable + Python UDF inlining (see above), restrict senders to built-in + functions and pre-registered Rust-side UDFs, and avoid + [`pickle.loads`][pickle.loads] on externally supplied bytes entirely. + +### Reference: session context slots + +There is only one type — [`SessionContext`][datafusion.context.SessionContext]. 
It can occupy +up to four *slots* in a running program: + +| Slot | Lifetime | Purpose | Set how | +|------|----------|---------|---------| +| User-held | Local variable / attribute | Build and run queries | `ctx = SessionContext(...)` | +| Global | Process singleton (lazy-init) | Backs module-level [`read_parquet`][datafusion.io.read_parquet], [`read_csv`][datafusion.io.read_csv], [`read_json`][datafusion.io.read_json], [`read_avro`][datafusion.io.read_avro]; final fallback for [`Expr.from_bytes`][datafusion.expr.Expr.from_bytes] | Implicit; access via [`global_ctx`][datafusion.context.SessionContext.global_ctx] | +| Sender | Thread-local on the driver | Codec settings for outbound `pickle.dumps` / [`Expr.to_bytes`][datafusion.expr.Expr.to_bytes] without `ctx` | [`set_sender_ctx`][datafusion.ipc.set_sender_ctx] | +| Worker | Thread-local on the worker | Function registry for inbound `pickle.loads` / [`Expr.from_bytes`][datafusion.expr.Expr.from_bytes] without `ctx` | [`set_worker_ctx`][datafusion.ipc.set_worker_ctx] | + +The same [`SessionContext`][datafusion.context.SessionContext] object may occupy more than one +slot simultaneously — installing it into a slot is a reference, not +a copy. A non-distributed program only ever uses the user-held slot; +the global slot is invisible unless you call top-level `read_*` +helpers. + +Resolution order on the worker side is *explicit argument → +worker context → global context.* Explicit `ctx=` on +[`from_bytes`][datafusion.expr.Expr.from_bytes] always wins; the sender slot is ignored +on decode and the worker slot is ignored on encode. + +Sharp edges: + +- Sender and worker slots are **thread-local**. Background threads + on either side see `None` until they install their own. 
+- Under the `fork` start method, the parent's `threading.local()` + values are copied into the child by copy-on-write — a forked + worker initially observes whatever sender / worker slot the parent + had set, until the worker writes its own value (or calls the + matching `clear_*_ctx`). `spawn` and `forkserver` workers + start with empty thread-local slots. Treat the slot as + uninitialized on worker entry and install (or clear) it explicitly + in the worker initializer; do not rely on inherited state. +- The global slot persists across `fork` workers (inherited through + copy-on-write memory) but not across `spawn` / `forkserver` workers + (fresh process — register or install a worker context on + start-up). +- The inlining toggle is per-context state, not a global switch. + Two contexts with different toggles can coexist in one process. + +## Query-level distribution via datafusion-distributed + +🚧 *Work in progress upstream — not yet usable from datafusion-python.* + +[datafusion-distributed](https://github.com/datafusion-contrib/datafusion-distributed) +splits a single physical plan into stages and runs each stage on a +different worker node. The driver writes a SQL or DataFrame query +once; the runtime handles partitioning, shuffles, and reassembly. + +A datafusion-python integration is in development. This section will +document the integration once it lands. In the meantime, the +expression-level approach above covers most use cases that do not +require automatic plan partitioning. + +## Query-level distribution via Apache Ballista + +🚧 *Work in progress upstream — not yet usable from datafusion-python.* + +[Apache Ballista](https://datafusion.apache.org/ballista/) +provides distributed query execution on top of DataFusion with a +scheduler / executor model better suited to long-lived cluster +deployments. A datafusion-python integration is on the roadmap; this +section will be filled in once the integration is usable. 
+ +## See also + +- [`ipc`][datafusion.ipc] — worker context API. +- [`examples/`](https://github.com/apache/datafusion-python/tree/main/examples) — + runnable scripts for `multiprocessing.Pool` and Ray actor patterns, + plus other end-to-end demos. diff --git a/docs/source/user-guide/distributing-work.rst b/docs/source/user-guide/distributing-work.rst deleted file mode 100644 index 03b5ca0b9..000000000 --- a/docs/source/user-guide/distributing-work.rst +++ /dev/null @@ -1,368 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Distributing work -================= - -DataFusion supports splitting work across processes by shipping -serialized expressions to workers: the driver builds an -:py:class:`~datafusion.Expr`, each worker evaluates it against its -own slice of data. This pattern suits embarrassingly-parallel -workloads where the driver decides partitioning up front. - -Query-level distribution — where the runtime partitions a single -logical or physical plan across worker nodes — is in progress -upstream via `datafusion-distributed -`_ and `Apache -Ballista `_. Both -have short sections at the end of this page; integration details -will land as those projects become usable from datafusion-python. 
- -Expression-level distribution ------------------------------ - -DataFusion expressions support distribution directly: pass one to a -worker process and Python's standard -`pickle `_ machinery -serializes it transparently — the same machinery -:py:meth:`multiprocessing.pool.Pool.map`, Ray's ``@ray.remote``, and -similar libraries already use to ship function arguments. Python UDFs -— scalar, aggregate, and window — travel inside the serialized -expression; the receiver does not need to pre-register them. - -Basic worker-pool example -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Define a worker function that takes the expression plus a batch and -returns the evaluated result: - -.. code-block:: python - - import pyarrow as pa - from datafusion import SessionContext - - - def evaluate(expr, batch): - # `expr` arrived here via the pool's automatic pickling — - # no manual serialization needed in user code. - ctx = SessionContext() - df = ctx.from_pydict({"a": batch}) - return df.with_column("result", expr).select("result").to_pydict()["result"] - -Then build the expression in the driver and fan it out: - -.. code-block:: python - - import multiprocessing as mp - from datafusion import col, udf - - double = udf( - lambda arr: pa.array([(v.as_py() or 0) * 2 for v in arr]), - [pa.int64()], pa.int64(), volatility="immutable", name="double", - ) - expr = double(col("a")) - - mp_ctx = mp.get_context("forkserver") - with mp_ctx.Pool(processes=4) as pool: - results = pool.starmap( - evaluate, - [(expr, [1, 2, 3]), (expr, [10, 20, 30])], - ) - print(results) # [[2, 4, 6], [20, 40, 60]] - -.. note:: - - When saved to a ``.py`` file and executed with the ``spawn`` or - ``forkserver`` start method, wrap the driver block in - ``if __name__ == "__main__":`` so worker processes can re-import - the module without re-running it. This is a standard Python - :py:mod:`multiprocessing` requirement, not DataFusion-specific — - see `Safe importing of main module - `_ - in the Python docs. 
- - -What travels with the expression -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -* **Built-in functions** (``abs``, ``length``, arithmetic, comparisons, - etc.) — fully portable. Worker needs nothing pre-registered. -* **Python UDFs** — travel inline (subject to the two portability - requirements below). The callable, its signature, and any state - captured in closures travel inside the serialized expression and are - reconstructed on the worker automatically. Applies equally to: - - * **scalar UDFs** (:py:func:`datafusion.udf`) - * **aggregate UDFs** (:py:func:`datafusion.udaf`) - * **window UDFs** (:py:func:`datafusion.udwf`) -* **UDFs imported via the FFI capsule protocol** — travel **by name - only**. The worker must already have a matching registration on its - :py:class:`SessionContext`. Without that registration, evaluation - raises an error. - -Portability requirements for inline Python UDFs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Inline Python UDFs ride on `cloudpickle -`_, which imposes two -requirements on the worker environment: - -* **Matching Python minor version.** cloudpickle serializes Python - bytecode, which is not stable across minor versions. A UDF pickled - on 3.12 cannot be reconstructed on 3.11 or 3.13. The wire format - stamps the sender's ``(major, minor)``; mismatches raise a clear - error naming both versions. Align the Python version on driver and - workers. -* **Imported modules must be importable on the worker.** cloudpickle - captures the callable *by value* (bytecode and closure cells travel - whole), but names resolved through ``import`` are captured *by - reference* — module path only. A UDF doing - ``from mylib import transform`` requires ``mylib`` installed on the - worker. Same applies to bound methods of imported classes. - Self-contained UDFs (no imports beyond what the worker already has, - e.g. ``pyarrow``) avoid this entirely. 
- -Registering shared UDFs on workers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When an expression references an FFI capsule UDF (or any UDF the -worker must resolve from its registered functions), set up the -worker's :py:class:`SessionContext` once per process and install it -as the *worker context*: - -.. code-block:: python - - from datafusion import SessionContext - from datafusion.ipc import set_worker_ctx - - - def init_worker(): - ctx = SessionContext() - ctx.register_udaf(my_ffi_aggregate) - set_worker_ctx(ctx) - - - with mp.get_context("forkserver").Pool( - processes=4, initializer=init_worker - ) as pool: - ... - -Inside a worker, expressions arriving from the driver resolve their -by-name references against the installed worker context. If no worker -context is installed, the global :py:class:`SessionContext` is used — -fine for expressions that only reference built-ins and Python UDFs, -but FFI-capsule-backed registrations must be installed on the global -context to resolve. - -Python 3.14 default change -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Python 3.14 changed the Linux default start method for -:py:mod:`multiprocessing` from ``fork`` to ``forkserver`` (macOS has -defaulted to ``spawn`` since Python 3.8; Windows has always used -``spawn``). With ``fork``, any state set in the parent was visible in -workers via copy-on-write; with ``forkserver`` and ``spawn`` it is -not. The :py:func:`~datafusion.ipc.set_worker_ctx` pattern works on -every start method — prefer it over relying on inherited state. - -Practical considerations -~~~~~~~~~~~~~~~~~~~~~~~~ - -* **Serialized size scales with what travels inline.** A serialized - expression of just built-ins is small (tens of bytes). An - expression carrying a Python UDF is hundreds of bytes (the callable - and its signature). 
When the same UDF is shipped many times, - registering an equivalent FFI-capsule UDF on each worker via - :py:func:`~datafusion.ipc.set_worker_ctx` and referring to it by - name cuts the per-trip overhead. -* **Closure capture.** When a Python UDF closes over surrounding - state — local variables, module-level objects, file paths — that - state is captured at serialization time. Surprises are possible if - the captured state is large, mutable, or not portable to the - worker's environment. See `Portability requirements for inline - Python UDFs`_ for the Python-version and imported-module rules. - -Disabling Python UDF inlining -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -For a stricter wire format, call -:py:meth:`SessionContext.with_python_udf_inlining(enabled=False) -` on the session -producing or consuming the bytes. With inlining disabled, Python -UDFs travel by name only — the same way FFI-capsule UDFs do — and -the receiver must have a matching registration. - -Two use cases: - -* **Cross-language portability.** A non-Python decoder cannot - reconstruct a cloudpickled payload. Senders aimed at Java, C++, - or another Rust binary disable inlining and rely on the receiver - having compatible UDF registrations. -* **Untrusted-source decode.** With inlining disabled, - :py:meth:`Expr.from_bytes` never calls ``cloudpickle.loads`` on - the incoming bytes — an inline payload from a misbehaving sender - raises a clear error instead of executing arbitrary Python code. - -Mismatched configurations raise a descriptive error: an inline blob -fed to a strict receiver fails fast rather than silently dropping -into ``cloudpickle.loads``. - -To make the toggle apply through :py:func:`pickle.dumps` (which -calls :py:meth:`Expr.to_bytes` with no context), install the strict -session as the driver's *sender context*: - -.. 
code-block:: python - - from datafusion import SessionContext - from datafusion.ipc import set_sender_ctx - - set_sender_ctx(SessionContext().with_python_udf_inlining(enabled=False)) - # Every subsequent pickle.dumps(expr) on this thread encodes - # without inlining the Python callable. - -Pair with a matching strict worker context -(:py:func:`~datafusion.ipc.set_worker_ctx`) so the ``pickle.loads`` -side also refuses inline payloads. Explicit -:py:meth:`Expr.to_bytes(ctx) ` and -:py:meth:`Expr.from_bytes(blob, ctx=ctx) ` calls -honor the supplied ``ctx`` directly and ignore the sender / worker -contexts. - -The toggle only narrows the :py:meth:`Expr.from_bytes` surface; -:py:func:`pickle.loads` on untrusted bytes remains unsafe regardless -of this setting. See the `Security`_ section below for the full -threat model. - -Security -~~~~~~~~ - -.. warning:: - - Reconstructing an expression containing a Python UDF executes - arbitrary Python code on the receiver — pickle is doing the work - under the hood and pickle is unsafe on untrusted input (see the - `pickle module security warning - `_ - in the Python standard library docs). Only accept expressions - from trusted sources. For untrusted-source workflows, disable - Python UDF inlining (see above), restrict senders to built-in - functions and pre-registered Rust-side UDFs, and avoid - :py:func:`pickle.loads` on externally supplied bytes entirely. - -Reference: session context slots -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There is only one type — :py:class:`SessionContext`. It can occupy -up to four *slots* in a running program: - -.. 
list-table:: - :header-rows: 1 - :widths: 12 18 40 30 - - * - Slot - - Lifetime - - Purpose - - Set how - * - User-held - - Local variable / attribute - - Build and run queries - - ``ctx = SessionContext(...)`` - * - Global - - Process singleton (lazy-init) - - Backs module-level - :py:func:`~datafusion.io.read_parquet`, - :py:func:`~datafusion.io.read_csv`, - :py:func:`~datafusion.io.read_json`, - :py:func:`~datafusion.io.read_avro`; final fallback for - :py:meth:`Expr.from_bytes` - - Implicit; access via - :py:meth:`SessionContext.global_ctx` - * - Sender - - Thread-local on the driver - - Codec settings for outbound :py:func:`pickle.dumps` / - :py:meth:`Expr.to_bytes` without ``ctx`` - - :py:func:`~datafusion.ipc.set_sender_ctx` - * - Worker - - Thread-local on the worker - - Function registry for inbound :py:func:`pickle.loads` / - :py:meth:`Expr.from_bytes` without ``ctx`` - - :py:func:`~datafusion.ipc.set_worker_ctx` - -The same :py:class:`SessionContext` object may occupy more than one -slot simultaneously — installing it into a slot is a reference, not -a copy. A non-distributed program only ever uses the user-held slot; -the global slot is invisible unless you call top-level ``read_*`` -helpers. - -Resolution order on the worker side is *explicit argument → -worker context → global context.* Explicit ``ctx=`` on -:py:meth:`Expr.from_bytes` always wins; the sender slot is ignored -on decode and the worker slot is ignored on encode. - -Sharp edges: - -* Sender and worker slots are **thread-local**. Background threads - on either side see ``None`` until they install their own. -* Under the ``fork`` start method, the parent's ``threading.local()`` - values are copied into the child by copy-on-write — a forked - worker initially observes whatever sender / worker slot the parent - had set, until the worker writes its own value (or calls the - matching ``clear_*_ctx``). ``spawn`` and ``forkserver`` workers - start with empty thread-local slots. 
Treat the slot as - uninitialized on worker entry and install (or clear) it explicitly - in the worker initializer; do not rely on inherited state. -* The global slot persists across ``fork`` workers (copy-on-write - memory inherit) but not across ``spawn`` / ``forkserver`` workers - (fresh process — register or install a worker context on - start-up). -* The inlining toggle is per-context state, not a global switch. - Two contexts with different toggles can coexist in one process. - -Query-level distribution via datafusion-distributed ---------------------------------------------------- - -🚧 *Work in progress upstream — not yet usable from datafusion-python.* - -`datafusion-distributed `_ -splits a single physical plan into stages and runs each stage on a -different worker node. The driver writes a SQL or DataFrame query -once; the runtime handles partitioning, shuffles, and reassembly. - -A datafusion-python integration is in development. This section will -document the integration once it lands. In the meantime, the -expression-level approach above covers most use cases that do not -require automatic plan partitioning. - -Query-level distribution via Apache Ballista --------------------------------------------- - -🚧 *Work in progress upstream — not yet usable from datafusion-python.* - -`Apache Ballista `_ -provides distributed query execution on top of DataFusion with a -scheduler / executor model better suited to long-lived cluster -deployments. A datafusion-python integration is on the roadmap; this -section will fill in once the integration is usable. - -See also --------- - -* :py:mod:`datafusion.ipc` — worker context API. -* ``examples/multiprocessing_pickle_expr.py`` — runnable - ``multiprocessing.Pool`` example that ships a different parametric - expression to each worker and collects results back. -* ``examples/ray_pickle_expr.py`` — runnable Ray actor example. 
diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md new file mode 100644 index 000000000..34a29ea98 --- /dev/null +++ b/docs/source/user-guide/index.md @@ -0,0 +1,48 @@ + + +# User Guide + +The user guide walks through installing DataFusion in Python, building queries +with the DataFrame API or SQL, reading and writing data, and tuning execution. + +## Contents + +- [Introduction](introduction.md) — what DataFusion in Python is and + when to reach for it. +- [Concepts](concepts.md) — `SessionContext`, `DataFrame`, and + `Expr` at a glance. +- [Data Sources](data-sources.md) — reading Parquet / CSV / JSON / + Avro, in-memory DataFrames, object stores, Delta Lake, Iceberg, + custom table providers, and catalogs. +- [DataFrame](dataframe/index.md) — building queries with the DataFrame + API, rendering, and execution metrics. +- [Common Operations](common-operations/index.md) — select, filter, + joins, aggregations, windows, expressions, UDFs/UDAFs. +- [I/O](io/index.md) — per-format reading and writing details. +- [Configuration](configuration.md) — `SessionConfig` / + `RuntimeEnvBuilder` tuning options. +- [Distributing Work](distributing-work.md) — shipping expressions to + worker processes via pickle / cloudpickle, FFI-capsule UDFs, and + the sender/worker context model. +- [SQL](sql.md) — registering tables and running SQL queries. +- [Upgrade Guides](upgrade-guides.md) — notes on cross-version + migrations. +- [AI Coding Assistants](ai-coding-assistants.md) — agent-facing + reference material and skill files. diff --git a/docs/source/user-guide/index.rst b/docs/source/user-guide/index.rst deleted file mode 100644 index 2d6b94392..000000000 --- a/docs/source/user-guide/index.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. 
The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -========== -User Guide -========== - -The user guide walks through installing DataFusion in Python, building queries -with the DataFrame API or SQL, reading and writing data, and tuning execution. - -.. toctree:: - :maxdepth: 2 - - introduction - basics - data-sources - dataframe/index - common-operations/index - io/index - configuration - distributing-work - sql - upgrade-guides - ai-coding-assistants diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md new file mode 100644 index 000000000..9b70f2fe7 --- /dev/null +++ b/docs/source/user-guide/introduction.md @@ -0,0 +1,73 @@ + + + +# Introduction + +Welcome to the User Guide for the Python bindings of Arrow DataFusion. This guide aims to provide an introduction to +DataFusion through various examples and highlight the most effective ways of using it. + +## Installation + +DataFusion is a Python library and, as such, can be installed via pip from [PyPI](https://pypi.org/project/datafusion). + +```shell +pip install datafusion +``` + +You can verify the installation by running: + +```python exec="1" source="material-block" result="text" session="introduction" +print(datafusion.__version__) +``` + + +In this documentation we will also show some examples for how DataFusion integrates +with Jupyter notebooks. 
To install and start a JupyterLab session, use: + +```shell +pip install jupyterlab +jupyter lab +``` + +To demonstrate working with DataFusion, we need a data source. Later in the tutorial we will show +options for data sources. For our first example, we demonstrate using a Pokemon dataset that you +can download +[here](https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv). + +With that file in place you can use the following Python example to view the DataFrame in +DataFusion. + +```python exec="1" source="material-block" result="text" session="introduction" +ctx = SessionContext() + +df = ctx.read_csv("pokemon.csv") + +df.show() +``` + + +If you are working in a Jupyter notebook, you can also use the following to give you a table +display that may be easier to read. + +```python +display(df) +``` + +![Rendered table showing Pokemon DataFrame](../images/jupyter_lab_df_view.png) diff --git a/docs/source/user-guide/introduction.rst b/docs/source/user-guide/introduction.rst deleted file mode 100644 index 7b30ef2b2..000000000 --- a/docs/source/user-guide/introduction.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -..
under the License. - -.. _guide: - -Introduction -============ - -Welcome to the User Guide for the Python bindings of Arrow DataFusion. This guide aims to provide an introduction to -DataFusion through various examples and highlight the most effective ways of using it. - -Installation ------------- - -DataFusion is a Python library and, as such, can be installed via pip from `PyPI `__. - -.. code-block:: shell - - pip install datafusion - -You can verify the installation by running: - -.. ipython:: python - - import datafusion - datafusion.__version__ - -In this documentation we will also show some examples for how DataFusion integrates -with Jupyter notebooks. To install and start a Jupyter labs session use - -.. code-block:: shell - - pip install jupyterlab - jupyter lab - -To demonstrate working with DataFusion, we need a data source. Later in the tutorial we will show -options for data sources. For our first example, we demonstrate using a Pokemon dataset that you -can download -`here `_. - -With that file in place you can use the following python example to view the DataFrame in -DataFusion. - -.. ipython:: python - - from datafusion import SessionContext - - ctx = SessionContext() - - df = ctx.read_csv("pokemon.csv") - - df.show() - -If you are working in a Jupyter notebook, you can also use the following to give you a table -display that may be easier to read. - -.. code-block:: shell - - display(df) - -.. image:: ../images/jupyter_lab_df_view.png - :width: 800 - :alt: Rendered table showing Pokemon DataFrame diff --git a/docs/source/user-guide/io/arrow.md b/docs/source/user-guide/io/arrow.md new file mode 100644 index 000000000..455f40168 --- /dev/null +++ b/docs/source/user-guide/io/arrow.md @@ -0,0 +1,70 @@ + + +# Arrow + +DataFusion implements the +[Apache Arrow PyCapsule interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) +for importing and exporting DataFrames with zero copy. 
With this feature, any Python +project that implements this interface can share data back and forth with DataFusion +with zero copy. + +We can demonstrate using [pyarrow](https://arrow.apache.org/docs/python/index.html). + +## Importing to DataFusion + +Here we will create an Arrow table and import it to DataFusion. + +To import an Arrow table, use [`from_arrow`][datafusion.context.SessionContext.from_arrow]. +This will accept any Python object that implements +[\_\_arrow_c_stream\_\_](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowstream-export) +or [\_\_arrow_c_array\_\_](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export) +and returns a `StructArray`. Common pyarrow sources you can use are: + +- [Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html) (but it must return a Struct Array) +- [Record Batch](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html) +- [Record Batch Reader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html) +- [Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html) + +```python exec="1" source="material-block" result="text" session="arrow" +import pyarrow as pa + +data = {"a": [1, 2, 3], "b": [4, 5, 6]} +table = pa.Table.from_pydict(data) + +ctx = SessionContext() +df = ctx.from_arrow(table) +print(df) +``` + + +## Exporting from DataFusion + +DataFusion DataFrames implement the `__arrow_c_stream__` PyCapsule interface, so any +Python library that accepts these can import a DataFusion DataFrame directly. + +Invoking `__arrow_c_stream__` triggers execution of the underlying query, but +batches are yielded incrementally rather than materialized all at once in memory. +Consumers can process the stream as it arrives. The stream executes lazily, +letting downstream readers pull batches on demand.
+ +```python exec="1" source="material-block" result="text" session="arrow" +df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d")) +print(pa.table(df)) +``` diff --git a/docs/source/user-guide/io/arrow.rst b/docs/source/user-guide/io/arrow.rst deleted file mode 100644 index 9196fcea7..000000000 --- a/docs/source/user-guide/io/arrow.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Arrow -===== - -DataFusion implements the -`Apache Arrow PyCapsule interface `_ -for importing and exporting DataFrames with zero copy. With this feature, any Python -project that implements this interface can share data back and forth with DataFusion -with zero copy. - -We can demonstrate using `pyarrow `_. - -Importing to DataFusion ------------------------ - -Here we will create an Arrow table and import it to DataFusion. - -To import an Arrow table, use :py:func:`datafusion.context.SessionContext.from_arrow`. -This will accept any Python object that implements -`__arrow_c_stream__ `_ -or `__arrow_c_array__ `_ -and returns a ``StructArray``. 
Common pyarrow sources you can use are: - -- `Array `_ (but it must return a Struct Array) -- `Record Batch `_ -- `Record Batch Reader `_ -- `Table `_ - -.. ipython:: python - - from datafusion import SessionContext - import pyarrow as pa - - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - table = pa.Table.from_pydict(data) - - ctx = SessionContext() - df = ctx.from_arrow(table) - df - -Exporting from DataFusion -------------------------- - -DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any -Python library that accepts these can import a DataFusion DataFrame directly. - -Invoking ``__arrow_c_stream__`` triggers execution of the underlying query, but -batches are yielded incrementally rather than materialized all at once in memory. -Consumers can process the stream as it arrives. The stream executes lazily, -letting downstream readers pull batches on demand. - - -.. ipython:: python - - from datafusion import col, lit - - df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d")) - pa.table(df) - diff --git a/docs/source/user-guide/io/avro.md b/docs/source/user-guide/io/avro.md new file mode 100644 index 000000000..62c7c94d0 --- /dev/null +++ b/docs/source/user-guide/io/avro.md @@ -0,0 +1,31 @@ + + + +# Avro + +[Avro](https://avro.apache.org/) is a serialization format for record data. Reading an Avro file is straightforward +with [`read_avro`][datafusion.context.SessionContext.read_avro]: + +```python +from datafusion import SessionContext + +ctx = SessionContext() +df = ctx.read_avro("file.avro") +``` diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst deleted file mode 100644 index 66398ac7f..000000000 --- a/docs/source/user-guide/io/avro.rst +++ /dev/null @@ -1,32 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership.
The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _io_avro: - -Avro -==== - -`Avro `_ is a serialization format for record data. Reading an avro file is very straightforward -with :py:func:`~datafusion.context.SessionContext.read_avro` - -.. code-block:: python - - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.read_avro("file.avro") \ No newline at end of file diff --git a/docs/source/user-guide/io/csv.md b/docs/source/user-guide/io/csv.md new file mode 100644 index 000000000..0fa1369fa --- /dev/null +++ b/docs/source/user-guide/io/csv.md @@ -0,0 +1,59 @@ + + + +# CSV + +Reading a csv is very straightforward with [`read_csv`][datafusion.context.SessionContext.read_csv] + +```python +from datafusion import SessionContext + +ctx = SessionContext() +df = ctx.read_csv("file.csv") +``` + +An alternative is to use [`register_csv`][datafusion.context.SessionContext.register_csv] + +```python +ctx.register_csv("file", "file.csv") +df = ctx.table("file") +``` + +If you require additional control over how to read the CSV file, you can use +[`CsvReadOptions`][datafusion.options.CsvReadOptions] to set a variety of options. 
+ +```python +from datafusion import CsvReadOptions +options = ( + CsvReadOptions() + .with_has_header(True) # File contains a header row + .with_delimiter(";") # Use ; as the delimiter instead of , + .with_comment("#") # Skip lines starting with # + .with_escape("\\") # Escape character + .with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL + .with_truncated_rows(True) # Allow rows to have incomplete columns + .with_file_compression_type("gzip") # Read gzipped CSV + .with_file_extension(".gz") # File extension other than .csv +) +df = ctx.read_csv("data.csv.gz", options=options) +``` + +Details for all CSV reading options can be found on the +[DataFusion documentation site](https://datafusion.apache.org/library-user-guide/custom-table-providers.html). diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst deleted file mode 100644 index 9c23c291b..000000000 --- a/docs/source/user-guide/io/csv.rst +++ /dev/null @@ -1,60 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _io_csv: - -CSV -=== - -Reading a csv is very straightforward with :py:func:`~datafusion.context.SessionContext.read_csv` - -.. 
code-block:: python - - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.read_csv("file.csv") - -An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv` - -.. code-block:: python - - ctx.register_csv("file", "file.csv") - df = ctx.table("file") - -If you require additional control over how to read the CSV file, you can use -:py:class:`~datafusion.options.CsvReadOptions` to set a variety of options. - -.. code-block:: python - - from datafusion import CsvReadOptions - options = ( - CsvReadOptions() - .with_has_header(True) # File contains a header row - .with_delimiter(";") # Use ; as the delimiter instead of , - .with_comment("#") # Skip lines starting with # - .with_escape("\\") # Escape character - .with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL - .with_truncated_rows(True) # Allow rows to have incomplete columns - .with_file_compression_type("gzip") # Read gzipped CSV - .with_file_extension(".gz") # File extension other than .csv - ) - df = ctx.read_csv("data.csv.gz", options=options) - -Details for all CSV reading options can be found on the -`DataFusion documentation site `_. diff --git a/docs/source/user-guide/io/index.md b/docs/source/user-guide/io/index.md new file mode 100644 index 000000000..5aa9e3992 --- /dev/null +++ b/docs/source/user-guide/io/index.md @@ -0,0 +1,45 @@ + + +# IO + +DataFusion can read and write a range of file formats and stream data in +through Arrow-compatible Python objects. + +## File formats + +| Format | Reader | Notes | +|---|---|---| +| [Apache Arrow](arrow.md) | [`SessionContext.read_arrow`][datafusion.context.SessionContext.read_arrow] | Single Arrow IPC file. | +| [Avro](avro.md) | [`SessionContext.read_avro`][datafusion.context.SessionContext.read_avro] | Schema-on-read; requires the Avro feature in the wheel. 
| +| [CSV](csv.md) | [`SessionContext.read_csv`][datafusion.context.SessionContext.read_csv] | Header inference, custom delimiters, gzip/bz2 compression. | +| [JSON](json.md) | [`SessionContext.read_json`][datafusion.context.SessionContext.read_json] | Newline-delimited JSON; one record per line. | +| [Parquet](parquet.md) | [`SessionContext.read_parquet`][datafusion.context.SessionContext.read_parquet] | Predicate / projection push-down, partitioned datasets. | + +## Custom sources + +- [Table Provider](table_provider.md) — register an arbitrary data source + (Delta Lake, Iceberg, your own Rust crate, etc.) by implementing the + table-provider FFI interface. + +## See also + +- [Data Sources](../data-sources.md) — concept overview, including + in-memory DataFrame creation from `pyarrow` / `pandas` / `polars` and + object-store integration. diff --git a/docs/source/user-guide/io/index.rst b/docs/source/user-guide/io/index.rst deleted file mode 100644 index b885cfeda..000000000 --- a/docs/source/user-guide/io/index.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -IO -== - -.. 
toctree:: - :maxdepth: 2 - - arrow - avro - csv - json - parquet - table_provider diff --git a/docs/source/user-guide/io/json.md b/docs/source/user-guide/io/json.md new file mode 100644 index 000000000..0b5d8d9d8 --- /dev/null +++ b/docs/source/user-guide/io/json.md @@ -0,0 +1,31 @@ + + + +# JSON + +[JSON](https://www.json.org/json-en.html) (JavaScript Object Notation) is a lightweight data-interchange format. +When it comes to reading a JSON file, using [`read_json`][datafusion.context.SessionContext.read_json] is simple and easy: + +```python +from datafusion import SessionContext + +ctx = SessionContext() +df = ctx.read_json("file.json") +``` diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst deleted file mode 100644 index 39030db7f..000000000 --- a/docs/source/user-guide/io/json.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _io_json: - -JSON -==== -`JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format. -When it comes to reading a JSON file, using :py:func:`~datafusion.context.SessionContext.read_json` is a simple and easy - -..
code-block:: python - - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.read_json("file.json") diff --git a/docs/source/user-guide/io/parquet.md b/docs/source/user-guide/io/parquet.md new file mode 100644 index 000000000..1e6e4a18e --- /dev/null +++ b/docs/source/user-guide/io/parquet.md @@ -0,0 +1,37 @@ + + + +# Parquet + +It is quite simple to read a parquet file using the [`read_parquet`][datafusion.context.SessionContext.read_parquet] function. + +```python +from datafusion import SessionContext + +ctx = SessionContext() +df = ctx.read_parquet("file.parquet") +``` + +An alternative is to use [`register_parquet`][datafusion.context.SessionContext.register_parquet] + +```python +ctx.register_parquet("file", "file.parquet") +df = ctx.table("file") +``` diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst deleted file mode 100644 index c5b9ca3d4..000000000 --- a/docs/source/user-guide/io/parquet.rst +++ /dev/null @@ -1,37 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _io_parquet: - -Parquet -======= - -It is quite simple to read a parquet file using the :py:func:`~datafusion.context.SessionContext.read_parquet` function. - -.. 
code-block:: python - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.read_parquet("file.parquet") - -An alternative is to use :py:func:`~datafusion.context.SessionContext.register_parquet` - -.. code-block:: python - - ctx.register_parquet("file", "file.parquet") - df = ctx.table("file") diff --git a/docs/source/user-guide/io/table_provider.md b/docs/source/user-guide/io/table_provider.md new file mode 100644 index 000000000..375f0b8ed --- /dev/null +++ b/docs/source/user-guide/io/table_provider.md @@ -0,0 +1,62 @@ + + + +# Custom Table Provider + +If you have a custom data source that you want to integrate with DataFusion, you can do so by +implementing the [TableProvider](https://datafusion.apache.org/library-user-guide/custom-table-providers.html) +interface in Rust and then exposing it in Python. To do so, +you must use DataFusion 43.0.0 or later and expose a [FFI_TableProvider](https://crates.io/crates/datafusion-ffi) +via [PyCapsule](https://pyo3.rs/main/doc/pyo3/types/struct.pycapsule). + +A complete example can be found in the [examples folder](https://github.com/apache/datafusion-python/tree/main/examples). + +```rust +#[pymethods] +impl MyTableProvider { + + fn __datafusion_table_provider__<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + let name = cr"datafusion_table_provider".into(); + + let provider = Arc::new(self.clone()); + let provider = FFI_TableProvider::new(provider, false, None); + + PyCapsule::new_bound(py, provider, Some(name.clone())) + } +} +``` + +Once you have this library available, you can construct a +[`Table`][datafusion.catalog.Table] in Python and register it with the +`SessionContext`. 
+ +```python +from datafusion import SessionContext, Table + +ctx = SessionContext() +provider = MyTableProvider() + +ctx.register_table("capsule_table", provider) + +ctx.table("capsule_table").show() +``` diff --git a/docs/source/user-guide/io/table_provider.rst b/docs/source/user-guide/io/table_provider.rst deleted file mode 100644 index 29e5d9880..000000000 --- a/docs/source/user-guide/io/table_provider.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -.. _io_custom_table_provider: - -Custom Table Provider -===================== - -If you have a custom data source that you want to integrate with DataFusion, you can do so by -implementing the `TableProvider `_ -interface in Rust and then exposing it in Python. To do so, -you must use DataFusion 43.0.0 or later and expose a `FFI_TableProvider `_ -via `PyCapsule `_. - -A complete example can be found in the `examples folder `_. - -.. 
code-block:: rust - - #[pymethods] - impl MyTableProvider { - - fn __datafusion_table_provider__<'py>( - &self, - py: Python<'py>, - ) -> PyResult> { - let name = cr"datafusion_table_provider".into(); - - let provider = Arc::new(self.clone()); - let provider = FFI_TableProvider::new(provider, false, None); - - PyCapsule::new_bound(py, provider, Some(name.clone())) - } - } - -Once you have this library available, you can construct a -:py:class:`~datafusion.Table` in Python and register it with the -``SessionContext``. - -.. code-block:: python - - from datafusion import SessionContext, Table - - ctx = SessionContext() - provider = MyTableProvider() - - ctx.register_table("capsule_table", provider) - - ctx.table("capsule_table").show() diff --git a/docs/source/user-guide/sql.md b/docs/source/user-guide/sql.md new file mode 100644 index 000000000..2409c5410 --- /dev/null +++ b/docs/source/user-guide/sql.md @@ -0,0 +1,133 @@ + + +# SQL + +DataFusion also offers a SQL API; read the full reference [here](https://arrow.apache.org/datafusion/user-guide/sql/index.html). + +```python exec="1" source="material-block" result="text" session="sql" +from datafusion import DataFrame, SessionContext + +# create a context +ctx = SessionContext() + +# register a CSV +ctx.register_csv("pokemon", "pokemon.csv") + +# create a new statement via SQL +df = ctx.sql('SELECT "Attack"+"Defense", "Attack"-"Defense" FROM pokemon') + +# collect and convert to pandas DataFrame +print(df.to_pandas()) +``` + + +## Parameterized queries + +In DataFusion-Python 51.0.0 we introduced the ability to pass parameters +in a SQL query. These are similar in concept to +[prepared statements](https://datafusion.apache.org/user-guide/sql/prepared_statements.html), +but allow passing named parameters into a SQL query. Consider this simple +example. 
+ +```python exec="1" source="material-block" result="text" session="sql" +def show_attacks(ctx: SessionContext, threshold: int) -> None: + ctx.sql( + 'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', val=threshold + ).show(num=5) + + +print(show_attacks(ctx, 75)) +``` + + +When passing parameters like the example above we convert the Python objects +into their string representation. We also have special case handling +for [`DataFrame`][datafusion.dataframe.DataFrame] objects, since they cannot simply +be turned into string representations for an SQL query. In these cases we +will register a temporary view in the [`SessionContext`][datafusion.context.SessionContext] +using a generated table name. + +The formatting for passing string replacement objects is to precede the +variable name with a single `$`. This works for all dialects in +the SQL parser except `hive` and `mysql`. Since these dialects do not +support named placeholders, we are unable to do this type of replacement. +We recommend either switching to another dialect or using Python +f-string style replacement. + +
+

Warning

+ +To support DataFrame parameterized queries, your session must support +registration of temporary views. The default +[`CatalogProvider`][datafusion.catalog.CatalogProvider] and +[`SchemaProvider`][datafusion.catalog.SchemaProvider] do have this capability. +If you have implemented custom providers, it is important that temporary +views do not persist across [`SessionContext`][datafusion.context.SessionContext] +instances, or you may get unintended consequences. + +
+ +The following example shows passing in both a [`DataFrame`][datafusion.dataframe.DataFrame] +object as well as a Python object to be used in parameterized replacement. + +```python exec="1" source="material-block" result="text" session="sql" +def show_column( + ctx: SessionContext, column: str, df: DataFrame, threshold: int +) -> None: + ctx.sql( + 'SELECT "Name", $col FROM $df WHERE $col > $val', + col=column, + df=df, + val=threshold, + ).show(num=5) + + +df = ctx.table("pokemon") +print(show_column(ctx, '"Defense"', df, 75)) +``` + + +The approach implemented for conversion of variables into a SQL query +relies on string conversion. This has the potential for data loss, +specifically for cases like floating point numbers. If you need to pass +variables into a parameterized query and it is important to maintain the +original value without conversion to a string, then you can use the +optional parameter `param_values` to specify these. This parameter +expects a dictionary mapping from the parameter name to a Python +object. Those objects will be cast into a +[PyArrow Scalar Value](https://arrow.apache.org/docs/python/generated/pyarrow.Scalar.html). + +Using `param_values` will rely on the SQL dialect you have configured +for your session. This can be set using the [configuration options](configuration.md) +of your [`SessionContext`][datafusion.context.SessionContext]. Similar to how +[prepared statements](https://datafusion.apache.org/user-guide/sql/prepared_statements.html) +work, these parameters are limited to places where you would pass in a +scalar value, such as a comparison. 
+ +```python exec="1" source="material-block" result="text" session="sql" +def param_attacks(ctx: SessionContext, threshold: int) -> None: + ctx.sql( + 'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', + param_values={"val": threshold}, + ).show(num=5) + + +print(param_attacks(ctx, 75)) +``` diff --git a/docs/source/user-guide/sql.rst b/docs/source/user-guide/sql.rst deleted file mode 100644 index b4bfb9611..000000000 --- a/docs/source/user-guide/sql.rst +++ /dev/null @@ -1,122 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -SQL -=== - -DataFusion also offers a SQL API, read the full reference `here `_ - -.. ipython:: python - - import datafusion - from datafusion import DataFrame, SessionContext - - # create a context - ctx = datafusion.SessionContext() - - # register a CSV - ctx.register_csv("pokemon", "pokemon.csv") - - # create a new statement via SQL - df = ctx.sql('SELECT "Attack"+"Defense", "Attack"-"Defense" FROM pokemon') - - # collect and convert to pandas DataFrame - df.to_pandas() - -Parameterized queries ---------------------- - -In DataFusion-Python 51.0.0 we introduced the ability to pass parameters -in a SQL query. 
These are similar in concept to -`prepared statements `_, -but allow passing named parameters into a SQL query. Consider this simple -example. - -.. ipython:: python - - def show_attacks(ctx: SessionContext, threshold: int) -> None: - ctx.sql( - 'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', val=threshold - ).show(num=5) - show_attacks(ctx, 75) - -When passing parameters like the example above we convert the Python objects -into their string representation. We also have special case handling -for :py:class:`~datafusion.dataframe.DataFrame` objects, since they cannot simply -be turned into string representations for an SQL query. In these cases we -will register a temporary view in the :py:class:`~datafusion.context.SessionContext` -using a generated table name. - -The formatting for passing string replacement objects is to precede the -variable name with a single ``$``. This works for all dialects in -the SQL parser except ``hive`` and ``mysql``. Since these dialects do not -support named placeholders, we are unable to do this type of replacement. -We recommend either switching to another dialect or using Python -f-string style replacement. - -.. warning:: - - To support DataFrame parameterized queries, your session must support - registration of temporary views. The default - :py:class:`~datafusion.catalog.CatalogProvider` and - :py:class:`~datafusion.catalog.SchemaProvider` do have this capability. - If you have implemented custom providers, it is important that temporary - views do not persist across :py:class:`~datafusion.context.SessionContext` - or you may get unintended consequences. - -The following example shows passing in both a :py:class:`~datafusion.dataframe.DataFrame` -object as well as a Python object to be used in parameterized replacement. - -.. 
ipython:: python - - def show_column( - ctx: SessionContext, column: str, df: DataFrame, threshold: int - ) -> None: - ctx.sql( - 'SELECT "Name", $col FROM $df WHERE $col > $val', - col=column, - df=df, - val=threshold, - ).show(num=5) - df = ctx.table("pokemon") - show_column(ctx, '"Defense"', df, 75) - -The approach implemented for conversion of variables into a SQL query -relies on string conversion. This has the potential for data loss, -specifically for cases like floating point numbers. If you need to pass -variables into a parameterized query and it is important to maintain the -original value without conversion to a string, then you can use the -optional parameter ``param_values`` to specify these. This parameter -expects a dictionary mapping from the parameter name to a Python -object. Those objects will be cast into a -`PyArrow Scalar Value `_. - -Using ``param_values`` will rely on the SQL dialect you have configured -for your session. This can be set using the :ref:`configuration options ` -of your :py:class:`~datafusion.context.SessionContext`. Similar to how -`prepared statements `_ -work, these parameters are limited to places where you would pass in a -scalar value, such as a comparison. - -.. ipython:: python - - def param_attacks(ctx: SessionContext, threshold: int) -> None: - ctx.sql( - 'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', - param_values={"val": threshold}, - ).show(num=5) - param_attacks(ctx, 75) diff --git a/docs/source/user-guide/upgrade-guides.md b/docs/source/user-guide/upgrade-guides.md new file mode 100644 index 000000000..d8c020963 --- /dev/null +++ b/docs/source/user-guide/upgrade-guides.md @@ -0,0 +1,167 @@ + + +# Upgrade Guides + +## DataFusion 54.0.0 + +### `Config` removed in favor of `SessionConfig` + +The `Config` class has been removed. It was a standalone wrapper around +`ConfigOptions` that could not be connected to a `SessionContext`, making it +effectively unusable. 
Use [`SessionConfig`][datafusion.context.SessionConfig] instead, +which is passed directly to `SessionContext`. + +Before: + +```python +from datafusion import Config + +config = Config() +config.set("datafusion.execution.batch_size", "4096") +# config could not be passed to SessionContext +``` + +After: + +```python +from datafusion import SessionConfig, SessionContext + +config = SessionConfig().set("datafusion.execution.batch_size", "4096") +ctx = SessionContext(config) +``` + +### `distinct` argument added to `sum` and `avg` + +The aggregate functions [`sum`][datafusion.functions.sum] and +[`avg`][datafusion.functions.avg] now accept a `distinct` argument, matching +the other aggregate functions. `distinct` is inserted *before* `filter` in the +argument list, so any code that passed `filter` positionally must be updated to +pass it as a keyword argument. The types are distinct so a type checker should flag this. + +Before: + +```python +f.sum(column("a"), my_filter) +f.avg(column("a"), my_filter) +``` + +Now: + +```python +f.sum(column("a"), filter=my_filter) +f.avg(column("a"), filter=my_filter) +``` + +## DataFusion 53.0.0 + +This version includes an upgraded version of `pyo3`, which changed the way to extract an FFI +object. Example: + +Before: + +```rust +let codec = unsafe { capsule.reference::() }; +``` + +Now: + +```rust +let data: NonNull = capsule + .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))? + .cast(); +let codec = unsafe { data.as_ref() }; +``` + +## DataFusion 52.0.0 + +This version includes a major update to the [ffi](../contributor-guide/ffi.md) due to upgrades +to the [Foreign Function Interface](https://doc.rust-lang.org/nomicon/ffi.html). +Users who contribute their own `CatalogProvider`, `SchemaProvider`, +`TableProvider` or `TableFunction` via FFI must now provide access to a +`LogicalExtensionCodec` and a `TaskContextProvider`. 
The function signatures +for the methods to get these `PyCapsule` objects now requires an additional +parameter, which is a Python object that can be used to extract the +`FFI_LogicalExtensionCodec` that is necessary. + +A complete example can be found in the [FFI example](https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example). +Your FFI hook methods — `__datafusion_catalog_provider__`, +`__datafusion_schema_provider__`, `__datafusion_table_provider__`, and +`__datafusion_table_function__` — need to be updated to accept an additional +`session: Bound` parameter, as shown in this example. + +```rust +#[pymethods] +impl MyCatalogProvider { + pub fn __datafusion_catalog_provider__<'py>( + &self, + py: Python<'py>, + session: Bound, + ) -> PyResult> { + let name = cr"datafusion_catalog_provider".into(); + + let provider = Arc::clone(&self.inner) as Arc; + + let codec = ffi_logical_codec_from_pycapsule(session)?; + let provider = FFI_CatalogProvider::new_with_ffi_codec(provider, None, codec); + + PyCapsule::new(py, provider, Some(name)) + } +} +``` + +To extract the logical extension codec FFI object from the provided object you +can implement a helper method such as: + +```rust +pub(crate) fn ffi_logical_codec_from_pycapsule( + obj: Bound, +) -> PyResult { + let attr_name = "__datafusion_logical_extension_codec__"; + let capsule = if obj.hasattr(attr_name)? { + obj.getattr(attr_name)?.call0()? + } else { + obj + }; + + let capsule = capsule.downcast::()?; + validate_pycapsule(capsule, "datafusion_logical_extension_codec")?; + + let codec = unsafe { capsule.reference::() }; + + Ok(codec.clone()) +} +``` + +The DataFusion FFI interface updates no longer depend directly on the +`datafusion` core crate. You can improve your build times and potentially +reduce your library binary size by removing this dependency and instead +using the specific datafusion project crates. 
+ +For example, instead of including expressions like: + +```rust +use datafusion::catalog::MemTable; +``` + +Instead you can now write: + +```rust +use datafusion_catalog::MemTable; +``` diff --git a/docs/source/user-guide/upgrade-guides.rst b/docs/source/user-guide/upgrade-guides.rst deleted file mode 100644 index 9671594b8..000000000 --- a/docs/source/user-guide/upgrade-guides.rst +++ /dev/null @@ -1,166 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -Upgrade Guides -============== - -DataFusion 54.0.0 ------------------ - -The ``Config`` class has been removed. It was a standalone wrapper around -``ConfigOptions`` that could not be connected to a ``SessionContext``, making it -effectively unusable. Use :py:class:`~datafusion.context.SessionConfig` instead, -which is passed directly to ``SessionContext``. - -Before: - -.. code-block:: python - - from datafusion import Config - - config = Config() - config.set("datafusion.execution.batch_size", "4096") - # config could not be passed to SessionContext - -After: - -.. 
code-block:: python - - from datafusion import SessionConfig, SessionContext - - config = SessionConfig().set("datafusion.execution.batch_size", "4096") - ctx = SessionContext(config) - -The aggregate functions :py:func:`~datafusion.functions.sum` and -:py:func:`~datafusion.functions.avg` now accept a ``distinct`` argument, matching -the other aggregate functions. ``distinct`` is inserted *before* ``filter`` in the -argument list, so any code that passed ``filter`` positionally must be updated to -pass it as a keyword argument. The types are distinct so a type checker should flag this. - -Before: - -.. code-block:: python - - f.sum(column("a"), my_filter) - f.avg(column("a"), my_filter) - -Now: - -.. code-block:: python - - f.sum(column("a"), filter=my_filter) - f.avg(column("a"), filter=my_filter) - -DataFusion 53.0.0 ------------------ - -This version includes an upgraded version of ``pyo3``, which changed the way to extract an FFI -object. Example: - -Before: - -.. code-block:: rust - - let codec = unsafe { capsule.reference::() }; - -Now: - -.. code-block:: rust - - let data: NonNull = capsule - .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))? - .cast(); - let codec = unsafe { data.as_ref() }; - -DataFusion 52.0.0 ------------------ - -This version includes a major update to the :ref:`ffi` due to upgrades -to the `Foreign Function Interface `_. -Users who contribute their own ``CatalogProvider``, ``SchemaProvider``, -``TableProvider`` or ``TableFunction`` via FFI must now provide access to a -``LogicalExtensionCodec`` and a ``TaskContextProvider``. The function signatures -for the methods to get these ``PyCapsule`` objects now requires an additional -parameter, which is a Python object that can be used to extract the -``FFI_LogicalExtensionCodec`` that is necessary. - -A complete example can be found in the `FFI example `_. 
-Your FFI hook methods — ``__datafusion_catalog_provider__``, -``__datafusion_schema_provider__``, ``__datafusion_table_provider__``, and -``__datafusion_table_function__`` — need to be updated to accept an additional -``session: Bound`` parameter, as shown in this example. - -.. code-block:: rust - - #[pymethods] - impl MyCatalogProvider { - pub fn __datafusion_catalog_provider__<'py>( - &self, - py: Python<'py>, - session: Bound, - ) -> PyResult> { - let name = cr"datafusion_catalog_provider".into(); - - let provider = Arc::clone(&self.inner) as Arc; - - let codec = ffi_logical_codec_from_pycapsule(session)?; - let provider = FFI_CatalogProvider::new_with_ffi_codec(provider, None, codec); - - PyCapsule::new(py, provider, Some(name)) - } - } - -To extract the logical extension codec FFI object from the provided object you -can implement a helper method such as: - -.. code-block:: rust - - pub(crate) fn ffi_logical_codec_from_pycapsule( - obj: Bound, - ) -> PyResult { - let attr_name = "__datafusion_logical_extension_codec__"; - let capsule = if obj.hasattr(attr_name)? { - obj.getattr(attr_name)?.call0()? - } else { - obj - }; - - let capsule = capsule.downcast::()?; - validate_pycapsule(capsule, "datafusion_logical_extension_codec")?; - - let codec = unsafe { capsule.reference::() }; - - Ok(codec.clone()) - } - - -The DataFusion FFI interface updates no longer depend directly on the -``datafusion`` core crate. You can improve your build times and potentially -reduce your library binary size by removing this dependency and instead -using the specific datafusion project crates. - -For example, instead of including expressions like: - -.. code-block:: rust - - use datafusion::catalog::MemTable; - -Instead you can now write: - -.. 
code-block:: rust - - use datafusion_catalog::MemTable; diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..6e0e50d28 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,179 @@ +site_name: Apache DataFusion in Python +site_description: Python bindings for Apache DataFusion query engine +site_url: https://datafusion.apache.org/python/ +repo_url: https://github.com/apache/datafusion-python +repo_name: apache/datafusion-python +copyright: Copyright 2019-2026, Apache Software Foundation + +docs_dir: docs/source +site_dir: docs/build/html +strict: true + +theme: + name: material + custom_dir: docs/source/_overrides + logo: _static/images/original.svg + favicon: _static/favicon.svg + features: + - navigation.sections + - navigation.indexes + - navigation.top + - navigation.instant + - navigation.tracking + - content.code.copy + - content.code.annotate + - search.highlight + - search.suggest + - toc.follow + palette: + - media: "(prefers-color-scheme: light)" + scheme: default + primary: white + accent: custom + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + accent: custom + toggle: + icon: material/brightness-4 + name: Switch to light mode + icon: + repo: fontawesome/brands/github + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/apache/datafusion-python + - icon: fontawesome/brands/rust + link: https://docs.rs/datafusion/latest/datafusion/ + name: Rust API docs (docs.rs) + +extra_css: + - _static/theme_overrides.css + +plugins: + - search + - markdown-exec + - mkdocstrings: + default_handler: python + handlers: + python: + paths: [python] + inventories: + - https://docs.python.org/3/objects.inv + - https://arrow.apache.org/docs/objects.inv + - https://docs.pola.rs/api/python/stable/objects.inv + options: + extensions: + - docs/griffe_extensions.py:SphinxRefsToAutorefs + docstring_style: google + docstring_options: + 
warn_unknown_params: false + returns_named_value: false + returns_multiple_items: false + show_source: false + members_order: source + inherited_members: true + show_root_heading: true + show_root_full_path: false + show_signature_annotations: true + separate_signature: true + merge_init_into_class: true + docstring_section_style: spacy + filters: ["!^_"] + - redirects: + redirect_maps: {} + +markdown_extensions: + - admonition + - attr_list + - def_list + - md_in_html + - footnotes + - tables + - toc: + permalink: true + toc_depth: 3 + - pymdownx.details + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - pymdownx.tasklist: + custom_checkbox: true + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + +watch: + - python/datafusion + +hooks: + - docs/hooks.py + +nav: + - Home: index.md + - User Guide: + - user-guide/index.md + - Introduction: user-guide/introduction.md + - Concepts: user-guide/concepts.md + - Data Sources: user-guide/data-sources.md + - DataFrame: + - user-guide/dataframe/index.md + - Rendering: user-guide/dataframe/rendering.md + - Execution Metrics: user-guide/dataframe/execution-metrics.md + - Common Operations: + - user-guide/common-operations/index.md + - Basic Info: user-guide/common-operations/basic-info.md + - Views: user-guide/common-operations/views.md + - Select and Filter: user-guide/common-operations/select-and-filter.md + - Expressions: user-guide/common-operations/expressions.md + - Joins: user-guide/common-operations/joins.md + - Functions: user-guide/common-operations/functions.md + - Aggregations: user-guide/common-operations/aggregations.md + - Windows: user-guide/common-operations/windows.md + - User-Defined Functions: user-guide/common-operations/udf-and-udfa.md + - I/O: + - user-guide/io/index.md + - Arrow: user-guide/io/arrow.md + - Avro: user-guide/io/avro.md + - CSV: user-guide/io/csv.md + - JSON: 
user-guide/io/json.md + - Parquet: user-guide/io/parquet.md + - Table Provider: user-guide/io/table_provider.md + - Configuration: user-guide/configuration.md + - Distributing Work: user-guide/distributing-work.md + - SQL: user-guide/sql.md + - Upgrade Guides: user-guide/upgrade-guides.md + - AI Coding Assistants: user-guide/ai-coding-assistants.md + - Contributor Guide: + - contributor-guide/index.md + - Introduction: contributor-guide/introduction.md + - FFI: contributor-guide/ffi.md + - API Reference: + - reference/index.md + - datafusion: + - reference/datafusion/index.md + - catalog: reference/datafusion/catalog.md + - common: reference/datafusion/common.md + - context: reference/datafusion/context.md + - dataframe: reference/datafusion/dataframe.md + - dataframe_formatter: reference/datafusion/dataframe_formatter.md + - expr: reference/datafusion/expr.md + - functions: reference/datafusion/functions.md + - input: reference/datafusion/input.md + - io: reference/datafusion/io.md + - ipc: reference/datafusion/ipc.md + - object_store: reference/datafusion/object_store.md + - options: reference/datafusion/options.md + - plan: reference/datafusion/plan.md + - record_batch: reference/datafusion/record_batch.md + - substrait: reference/datafusion/substrait.md + - unparser: reference/datafusion/unparser.md + - user_defined: reference/datafusion/user_defined.md + - Links: links.md diff --git a/pyproject.toml b/pyproject.toml index e18c1d57c..69dde845e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -151,8 +151,10 @@ extend-allowed-calls = ["datafusion.lit", "lit"] "E", "ERA001", "EXE", + "INP001", "N817", "PLR", + "PTH", "S", "SIM", "T", @@ -176,8 +178,18 @@ extend-allowed-calls = ["datafusion.lit", "lit"] "TRY", "UP", ] -"docs/*" = ["D"] -"docs/source/conf.py" = ["ANN001", "ERA001", "INP001"] +"docs/*" = ["D", "INP001"] +# Notebook content cells originate from prose-driven user-guide pages +# where bare `print()` calls, magic comparison values, and per-cell +# 
re-imports are part of the explanation rather than production code. +"docs/source/**/*.ipynb" = [ + "B905", + "F811", + "ICN001", + "PLR2004", + "PTH118", + "T201", +] # CI and pre-commit invoke codespell with different paths, so we have a little # redundancy here, and we intentionally drop python in the path. @@ -214,14 +226,10 @@ dev = [ # from sdist under free-threaded interpreters (PyO3 < 3.14 support). release = ["pygithub==2.5.0"] docs = [ - "ipython>=8.12.3", - "jinja2>=3.1.5", - "myst-parser>=3.0.1", + "markdown-exec[ansi]>=1.10", + "mkdocs>=1.6,<2", + "mkdocs-material>=9.5,<10", + "mkdocs-redirects>=1.2", + "mkdocstrings[python]>=0.27", "pandas>=2.0.3", - "pickleshare>=0.7.5", - "pydata-sphinx-theme>=0.16,<0.17", - "setuptools>=75.3.0", - "sphinx-autoapi>=3.4.0", - "sphinx-reredirects>=0.1.5", - "sphinx>=7.1.2", ] diff --git a/python/datafusion/__init__.py b/python/datafusion/__init__.py index 9c55f446c..3f6aed590 100644 --- a/python/datafusion/__init__.py +++ b/python/datafusion/__init__.py @@ -18,7 +18,7 @@ """DataFusion: an in-process query engine built on Apache Arrow. DataFusion is not a database -- it has no server and no external dependencies. -You create a :py:class:`SessionContext`, point it at data sources (Parquet, CSV, +You create a `SessionContext`, point it at data sources (Parquet, CSV, JSON, Arrow IPC, Pandas, Polars, or raw Python dicts/lists), and run queries using either SQL or the DataFrame API. @@ -27,26 +27,24 @@ - **SessionContext** -- entry point for loading data, running SQL, and creating DataFrames. - **DataFrame** -- lazy query builder. Every method returns a new DataFrame; - call :py:meth:`~datafusion.dataframe.DataFrame.collect` or a ``to_*`` + call :meth:`~datafusion.dataframe.DataFrame.collect` or a ``to_*`` method to execute. - **Expr** -- expression tree node for column references, literals, and function - calls. Build with :py:func:`col` and :py:func:`lit`. + calls. 
Build with :func:`~datafusion.col.col` and :func:`~datafusion.lit`. - **functions** -- 290+ built-in scalar, aggregate, and window functions. -Quick start ------------ - ->>> from datafusion import SessionContext, col ->>> from datafusion import functions as F ->>> ctx = SessionContext() ->>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) ->>> result = ( -... df.filter(col("a") > 1) -... .with_column("total", col("a") + col("b")) -... .aggregate([], [F.sum(col("total")).alias("grand_total")]) -... ) ->>> result.to_pydict() -{'grand_total': [16]} +Examples: + >>> from datafusion import SessionContext, col + >>> from datafusion import functions as F + >>> ctx = SessionContext() + >>> df = ctx.from_pydict({"a": [1, 2, 3], "b": [4, 5, 6]}) + >>> result = ( + ... df.filter(col("a") > 1) + ... .with_column("total", col("a") + col("b")) + ... .aggregate([], [F.sum(col("total")).alias("grand_total")]) + ... ) + >>> result.to_pydict() + {'grand_total': [16]} User guide and full documentation: https://datafusion.apache.org/python diff --git a/python/datafusion/catalog.py b/python/datafusion/catalog.py index 20da5e671..30f6fee11 100644 --- a/python/datafusion/catalog.py +++ b/python/datafusion/catalog.py @@ -220,7 +220,7 @@ def __repr__(self) -> str: @staticmethod @deprecated("Use Table() constructor instead.") def from_dataset(dataset: pa.dataset.Dataset) -> Table: - """Turn a :mod:`pyarrow.dataset` ``Dataset`` into a :class:`Table`.""" + """Turn a pyarrow ``Dataset`` into a :class:`~datafusion.catalog.Table`.""" return Table(dataset) @property @@ -239,7 +239,7 @@ class TableProviderFactory(ABC): @abstractmethod def create(self, cmd: CreateExternalTable) -> Table: - """Create a table using the :class:`CreateExternalTable`.""" + """Create a table using the `CreateExternalTable`.""" ... 
diff --git a/python/datafusion/context.py b/python/datafusion/context.py index 5dfeed719..52ee3811e 100644 --- a/python/datafusion/context.py +++ b/python/datafusion/context.py @@ -15,22 +15,22 @@ # specific language governing permissions and limitations # under the License. -""":py:class:`SessionContext` — entry point for running DataFusion queries. +"""`SessionContext` — entry point for running DataFusion queries. -A :py:class:`SessionContext` holds registered tables, catalogs, and +A `SessionContext` holds registered tables, catalogs, and configuration for the current session. It is the first object most programs create: from it you register data, run SQL strings -(:py:meth:`SessionContext.sql`), read files -(:py:meth:`SessionContext.read_csv`, -:py:meth:`SessionContext.read_parquet`, ...), and construct -:py:class:`~datafusion.dataframe.DataFrame` objects in memory -(:py:meth:`SessionContext.from_pydict`, -:py:meth:`SessionContext.from_arrow`). +(:meth:`~datafusion.context.SessionContext.sql`), read files +(:meth:`~datafusion.context.SessionContext.read_csv`, +:meth:`~datafusion.context.SessionContext.read_parquet`, ...), and construct +:class:`~datafusion.dataframe.DataFrame` objects in memory +(:meth:`~datafusion.context.SessionContext.from_pydict`, +:meth:`~datafusion.context.SessionContext.from_arrow`). Session behavior (memory limits, batch size, configured optimizer passes, -...) is controlled by :py:class:`SessionConfig` and -:py:class:`RuntimeEnvBuilder`; SQL dialect limits are controlled by -:py:class:`SQLOptions`. +...) is controlled by :class:`~datafusion.context.SessionConfig` and +`RuntimeEnvBuilder`; SQL dialect limits are controlled by +:class:`~datafusion.context.SQLOptions`. Examples: >>> ctx = dfn.SessionContext() @@ -38,7 +38,7 @@ >>> ctx.sql("SELECT 1 AS n").to_pydict() {'n': [1]} -See :ref:`user_guide_concepts` in the online documentation for the broader +See the user guide concepts in the online documentation for the broader execution model.
""" @@ -102,6 +102,18 @@ ) +__all__ = [ + "ArrowArrayExportable", + "ArrowStreamExportable", + "PhysicalOptimizerRuleExportable", + "RuntimeEnvBuilder", + "SQLOptions", + "SessionConfig", + "SessionContext", + "TableProviderExportable", +] + + class ArrowStreamExportable(Protocol): """Type hint for object exporting Arrow C Stream via Arrow PyCapsule Interface. @@ -147,7 +159,7 @@ class SessionConfig: """Session configuration options.""" def __init__(self, config_options: dict[str, str] | None = None) -> None: - """Create a new :py:class:`SessionConfig` with the given configuration options. + """Create a new `SessionConfig` with the given configuration options. Args: config_options: Configuration options. @@ -164,7 +176,7 @@ def with_create_default_catalog_and_schema( automatically created. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = ( self.config_internal.with_create_default_catalog_and_schema(enabled) @@ -181,7 +193,7 @@ def with_default_catalog_and_schema( schema: Schema name. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_default_catalog_and_schema( catalog, schema @@ -195,7 +207,7 @@ def with_information_schema(self, enabled: bool = True) -> SessionConfig: enabled: Whether to include ``information_schema`` virtual tables. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_information_schema(enabled) return self @@ -207,7 +219,7 @@ def with_batch_size(self, batch_size: int) -> SessionConfig: batch_size: Batch size. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_batch_size(batch_size) return self @@ -221,7 +233,7 @@ def with_target_partitions(self, target_partitions: int) -> SessionConfig: target_partitions: Number of target partitions. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_target_partitions( target_partitions @@ -237,7 +249,7 @@ def with_repartition_aggregations(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for aggregations. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_aggregations( enabled @@ -251,7 +263,7 @@ def with_repartition_joins(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for joins. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_joins(enabled) return self @@ -265,7 +277,7 @@ def with_repartition_windows(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for window functions. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_windows(enabled) return self @@ -279,7 +291,7 @@ def with_repartition_sorts(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for window functions. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_repartition_sorts(enabled) return self @@ -291,7 +303,7 @@ def with_repartition_file_scans(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use repartitioning for file scans. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_file_scans(enabled) return self @@ -303,7 +315,7 @@ def with_repartition_file_min_size(self, size: int) -> SessionConfig: size: Minimum file range size. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_repartition_file_min_size(size) return self @@ -317,7 +329,7 @@ def with_parquet_pruning(self, enabled: bool = True) -> SessionConfig: enabled: Whether to use pruning predicate for parquet readers. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.with_parquet_pruning(enabled) return self @@ -330,7 +342,7 @@ def set(self, key: str, value: str) -> SessionConfig: value: Option value. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. """ self.config_internal = self.config_internal.set(key, value) return self @@ -340,10 +352,10 @@ def with_extension(self, extension: Any) -> SessionConfig: Args: extension: A custom configuration extension object. These are - shared from another DataFusion extension library. + shared from another DataFusion extension library. Returns: - A new :py:class:`SessionConfig` object with the updated setting. + A new `SessionConfig` object with the updated setting. 
""" self.config_internal = self.config_internal.with_extension(extension) return self @@ -353,14 +365,14 @@ class RuntimeEnvBuilder: """Runtime configuration options.""" def __init__(self) -> None: - """Create a new :py:class:`RuntimeEnvBuilder` with default values.""" + """Create a new `RuntimeEnvBuilder` with default values.""" self.config_internal = RuntimeEnvBuilderInternal() def with_disk_manager_disabled(self) -> RuntimeEnvBuilder: """Disable the disk manager, attempts to create temporary files will error. Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_disabled() return self @@ -369,7 +381,7 @@ def with_disk_manager_os(self) -> RuntimeEnvBuilder: """Use the operating system's temporary directory for disk manager. Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. """ self.config_internal = self.config_internal.with_disk_manager_os() return self @@ -383,7 +395,7 @@ def with_disk_manager_specified( paths: Paths to use for the disk manager's temporary files. Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. """ paths_list = [str(p) for p in paths] self.config_internal = self.config_internal.with_disk_manager_specified( @@ -395,7 +407,7 @@ def with_unbounded_memory_pool(self) -> RuntimeEnvBuilder: """Use an unbounded memory pool. Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. """ self.config_internal = self.config_internal.with_unbounded_memory_pool() return self @@ -421,7 +433,7 @@ def with_fair_spill_pool(self, size: int) -> RuntimeEnvBuilder: size: Size of the memory pool in bytes. 
Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. Examples: >>> config = dfn.RuntimeEnvBuilder().with_fair_spill_pool(1024) @@ -433,14 +445,14 @@ def with_greedy_memory_pool(self, size: int) -> RuntimeEnvBuilder: """Use a greedy memory pool with the specified size. This pool works well for queries that do not need to spill or have a single - spillable operator. See :py:func:`with_fair_spill_pool` if there are + spillable operator. See `with_fair_spill_pool` if there are multiple spillable operators that all will spill. Args: size: Size of the memory pool in bytes. Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. Examples: >>> config = dfn.RuntimeEnvBuilder().with_greedy_memory_pool(1024) @@ -455,7 +467,7 @@ def with_temp_file_path(self, path: str | pathlib.Path) -> RuntimeEnvBuilder: path: Path to use for temporary files. Returns: - A new :py:class:`RuntimeEnvBuilder` object with the updated setting. + A new `RuntimeEnvBuilder` object with the updated setting. Examples: >>> config = dfn.RuntimeEnvBuilder().with_temp_file_path("/tmp") @@ -468,7 +480,7 @@ class SQLOptions: """Options to be used when performing SQL queries.""" def __init__(self) -> None: - """Create a new :py:class:`SQLOptions` with default values. + """Create a new `SQLOptions` with default values. The default values are: - DDL commands are allowed @@ -486,7 +498,7 @@ def with_allow_ddl(self, allow: bool = True) -> SQLOptions: allow: Allow DDL commands to be run. Returns: - A new :py:class:`SQLOptions` object with the updated setting. + A new `SQLOptions` object with the updated setting. Examples: >>> options = dfn.SQLOptions().with_allow_ddl(True) @@ -503,7 +515,7 @@ def with_allow_dml(self, allow: bool = True) -> SQLOptions: allow: Allow DML commands to be run. 
Returns: - A new :py:class:`SQLOptions` object with the updated setting. + A new `SQLOptions` object with the updated setting. Examples: >>> options = dfn.SQLOptions().with_allow_dml(True) @@ -530,7 +542,7 @@ def with_allow_statements(self, allow: bool = True) -> SQLOptions: class SessionContext: """This is the main interface for executing queries and creating DataFrames. - See :ref:`user_guide_concepts` in the online documentation for more information. + See the user guide concepts in the online documentation for more information. """ def __init__( @@ -551,7 +563,7 @@ def __init__( Example usage: The following example demonstrates how to use the context to execute - a query against a CSV data source using the :py:class:`DataFrame` API:: + a query against a CSV data source using the `DataFrame` API:: from datafusion import SessionContext @@ -583,7 +595,7 @@ def enable_url_table(self) -> SessionContext: """Control if local files can be queried as tables. Returns: - A new :py:class:`SessionContext` object with url table enabled. + A new `SessionContext` object with url table enabled. """ klass = self.__class__ obj = klass.__new__(klass) @@ -597,7 +609,7 @@ def register_object_store( Args: schema: The data source schema. - store: The :py:class:`~datafusion.object_store.ObjectStore` to register. + store: The :mod:`~datafusion.object_store` to register. host: URL for the host. """ self.ctx.register_object_store(schema, store, host) @@ -622,8 +634,8 @@ def register_listing_table( ) -> None: """Register multiple files as a single table. - Registers a :py:class:`~datafusion.catalog.Table` that can assemble multiple - files from locations in an :py:class:`~datafusion.object_store.ObjectStore` + Registers a :class:`~datafusion.catalog.Table` that can assemble multiple + files from locations in an :mod:`~datafusion.object_store` instance.
Args: @@ -655,7 +667,7 @@ def sql( param_values: dict[str, Any] | None = None, **named_params: Any, ) -> DataFrame: - """Create a :py:class:`~datafusion.DataFrame` from SQL query text. + """Create a :class:`~datafusion.dataframe.DataFrame` from SQL query text. See the online documentation for a description of how to perform parameterized substitution via either the ``param_values`` option @@ -664,7 +676,7 @@ def sql( Note: This API implements DDL statements such as ``CREATE TABLE`` and ``CREATE VIEW`` and DML statements such as ``INSERT INTO`` with in-memory default implementation.See - :py:func:`~datafusion.context.SessionContext.sql_with_options`. + :meth:`~datafusion.context.SessionContext.sql_with_options`. Args: query: SQL query text. @@ -720,7 +732,7 @@ def sql_with_options( param_values: dict[str, Any] | None = None, **named_params: Any, ) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from SQL query text. + """Create a :class:`~datafusion.dataframe.DataFrame` from SQL query text. This function will first validate that the query is allowed by the provided options. @@ -748,7 +760,7 @@ def create_dataframe( """Create and return a dataframe using the provided partitions. Args: - partitions: :py:class:`pa.RecordBatch` partitions to register. + partitions: :class:`~pyarrow.RecordBatch` partitions to register. name: Resultant dataframe name. schema: Schema for the partitions. @@ -758,7 +770,7 @@ def create_dataframe( return DataFrame(self.ctx.create_dataframe(partitions, name, schema)) def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from an existing plan. + """Create a :class:`~datafusion.dataframe.DataFrame` from an existing plan. Args: plan: Logical plan. 
@@ -771,7 +783,7 @@ def create_dataframe_from_logical_plan(self, plan: LogicalPlan) -> DataFrame: def from_pylist( self, data: list[dict[str, Any]], name: str | None = None ) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from a list. + """Create a :class:`~datafusion.dataframe.DataFrame` from a list. Args: data: List of dictionaries. @@ -785,7 +797,7 @@ def from_pylist( def from_pydict( self, data: dict[str, list[Any]], name: str | None = None ) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from a dictionary. + """Create a :class:`~datafusion.dataframe.DataFrame` from a dictionary. Args: data: Dictionary of lists. @@ -801,7 +813,7 @@ def from_arrow( data: ArrowStreamExportable | ArrowArrayExportable, name: str | None = None, ) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from an Arrow source. + """Create a :class:`~datafusion.dataframe.DataFrame` from an Arrow source. The Arrow data source can be any object that implements either ``__arrow_c_stream__`` or ``__arrow_c_array__``. For the latter, it must return @@ -819,7 +831,7 @@ def from_arrow( return DataFrame(self.ctx.from_arrow(data, name)) def from_pandas(self, data: pd.DataFrame, name: str | None = None) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Pandas DataFrame. + """Create a `DataFrame` from a Pandas DataFrame. Args: data: Pandas DataFrame. @@ -831,7 +843,7 @@ def from_pandas(self, data: pd.DataFrame, name: str | None = None) -> DataFrame: return DataFrame(self.ctx.from_pandas(data, name)) def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame: - """Create a :py:class:`~datafusion.dataframe.DataFrame` from a Polars DataFrame. + """Create a `DataFrame` from a Polars DataFrame. Args: data: Polars DataFrame. 
@@ -845,7 +857,7 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame: # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116 # is the discussion on how we arrived at adding register_view def register_view(self, name: str, df: DataFrame) -> None: - """Register a :py:class:`~datafusion.dataframe.DataFrame` as a view. + """Register a :class:`~datafusion.dataframe.DataFrame` as a view. Args: name (str): The name to register the view under. @@ -859,14 +871,14 @@ def register_table( name: str, table: Table | TableProviderExportable | DataFrame | pa.dataset.Dataset, ) -> None: - """Register a :py:class:`~datafusion.Table` with this context. + """Register a :class:`~datafusion.catalog.Table` with this context. The registered table can be referenced from SQL statements executed against this context. Args: name: Name of the resultant table. - table: Any object that can be converted into a :class:`Table`. + table: Any object that can be converted into a `Table`. """ self.ctx.register_table(name, table) @@ -879,14 +891,14 @@ def register_table_factory( format: str, factory: TableProviderFactory | TableProviderFactoryExportable, ) -> None: - """Register a :py:class:`~datafusion.TableProviderFactoryExportable`. + """Register a `TableProviderFactoryExportable`. The registered factory can be referenced from SQL DDL statements executed against this context. Args: format: The value to be used in `STORED AS ${format}` clause. - factory: A PyCapsule that implements :class:`TableProviderFactoryExportable` + factory: A PyCapsule that implements `TableProviderFactoryExportable` """ self.ctx.register_table_factory(format, factory) @@ -921,7 +933,9 @@ def register_table_provider( ) -> None: """Register a table provider. - Deprecated: use :meth:`register_table` instead. + Deprecated: use + :meth:`~datafusion.context.SessionContext.register_table` + instead. 
""" self.register_table(name, provider) @@ -930,7 +944,7 @@ def register_udtf(self, func: TableFunction) -> None: self.ctx.register_udtf(func._udtf) def register_batch(self, name: str, batch: pa.RecordBatch) -> None: - """Register a single :py:class:`pa.RecordBatch` as a table. + """Register a single :class:`~pyarrow.RecordBatch` as a table. Args: name: Name of the resultant table. @@ -973,12 +987,14 @@ def register_record_batches( self.ctx.register_record_batches(name, partitions) def read_batch(self, batch: pa.RecordBatch) -> DataFrame: - """Return a :py:class:`~datafusion.DataFrame` reading a single batch. + """Return a `DataFrame` reading a single batch. - Convenience wrapper around :py:meth:`read_batches` for the single-batch - case. Unlike :py:meth:`register_batch`, this does not register the - batch as a named table; it returns an anonymous - :py:class:`~datafusion.DataFrame` directly. + Convenience wrapper around + :meth:`~datafusion.context.SessionContext.read_batches` for the + single-batch case. Unlike + :meth:`~datafusion.context.SessionContext.register_batch`, this + does not register the batch as a named table; it returns an anonymous + :class:`~datafusion.dataframe.DataFrame` directly. Args: batch: Record batch to wrap as a DataFrame. @@ -992,14 +1008,14 @@ def read_batch(self, batch: pa.RecordBatch) -> DataFrame: return self.read_batches([batch]) def read_batches(self, batches: Iterable[pa.RecordBatch]) -> DataFrame: - """Return a :py:class:`~datafusion.DataFrame` reading the given batches. + """Return a `DataFrame` reading the given batches. All batches must share the same schema. Any iterable of - :py:class:`pa.RecordBatch` is accepted (list, tuple, generator); + :class:`~pyarrow.RecordBatch` is accepted (list, tuple, generator); it is materialized into a list before being handed to the - underlying Rust binding. Unlike :py:meth:`register_record_batches`, + underlying Rust binding. 
Unlike `register_record_batches`, this does not register the batches as a named table; it returns - an anonymous :py:class:`~datafusion.DataFrame` directly. + an anonymous :class:`~datafusion.dataframe.DataFrame` directly. Args: batches: Record batches to wrap as a DataFrame. @@ -1279,7 +1295,7 @@ def register_arrow( ) def register_dataset(self, name: str, dataset: pa.dataset.Dataset) -> None: - """Register a :py:class:`pa.dataset.Dataset` as a table. + """Register a :class:`~pyarrow.dataset.Dataset` as a table. Args: name: Name of the table to register. @@ -1326,9 +1342,10 @@ def deregister_udwf(self, name: str) -> None: def udf(self, name: str) -> ScalarUDF: """Look up a registered scalar UDF by name. - Returns the same ``ScalarUDF`` wrapper that :py:meth:`register_udf` - accepts, so it can be invoked as an expression in the DataFrame API - or re-registered into a different :py:class:`SessionContext`. + Returns the same ``ScalarUDF`` wrapper that + :meth:`~datafusion.context.SessionContext.register_udf` accepts, + so it can be invoked as an expression in the DataFrame API + or re-registered into a different `SessionContext`. Built-in scalar functions from the session's function registry are also looked up. @@ -1372,9 +1389,10 @@ def udf(self, name: str) -> ScalarUDF: def udaf(self, name: str) -> AggregateUDF: """Look up a registered aggregate UDF by name. - Returns the same ``AggregateUDF`` wrapper that :py:meth:`register_udaf` - accepts. Built-in aggregate functions such as ``sum`` or ``avg`` are - also discoverable through this lookup. See :py:meth:`udf` for a worked + Returns the same ``AggregateUDF`` wrapper that + :meth:`~datafusion.context.SessionContext.register_udaf` accepts. + Built-in aggregate functions such as ``sum`` or ``avg`` are + also discoverable through this lookup. See `udf` for a worked late-binding example; the pattern is identical for aggregates. 
Args: @@ -1385,7 +1403,7 @@ def udaf(self, name: str) -> AggregateUDF: Examples: Look up a built-in aggregate by name and use it in - :py:meth:`~datafusion.DataFrame.aggregate`: + :meth:`~datafusion.dataframe.DataFrame.aggregate`: >>> ctx = dfn.SessionContext() >>> sum_fn = ctx.udaf("sum") @@ -1402,9 +1420,10 @@ def udaf(self, name: str) -> AggregateUDF: def udwf(self, name: str) -> WindowUDF: """Look up a registered window UDF by name. - Returns the same ``WindowUDF`` wrapper that :py:meth:`register_udwf` - accepts. Built-in window functions such as ``row_number`` or ``rank`` - are also discoverable through this lookup. See :py:meth:`udf` for a + Returns the same ``WindowUDF`` wrapper that + :meth:`~datafusion.context.SessionContext.register_udwf` accepts. + Built-in window functions such as ``row_number`` or ``rank`` + are also discoverable through this lookup. See `udf` for a worked late-binding example; the pattern is identical for window functions. @@ -1432,7 +1451,7 @@ def udfs(self) -> list[str]: """Return the sorted names of all registered scalar UDFs. Includes both user-registered and built-in scalar functions. Pair - with :py:meth:`udf` to drive discovery, validation, or config-based + with `udf` to drive discovery, validation, or config-based dispatch. 
Examples: @@ -1475,11 +1494,11 @@ def table_exist(self, name: str) -> bool: return self.ctx.table_exist(name) def empty_table(self) -> DataFrame: - """Create an empty :py:class:`~datafusion.dataframe.DataFrame`.""" + """Create an empty :class:`~datafusion.dataframe.DataFrame`.""" return DataFrame(self.ctx.empty_table()) def session_id(self) -> str: - """Return an id that uniquely identifies this :py:class:`SessionContext`.""" + """Return an id that uniquely identifies this `SessionContext`.""" return self.ctx.session_id() def session_start_time(self) -> str: @@ -1503,7 +1522,7 @@ def enable_ident_normalization(self) -> bool: return self.ctx.enable_ident_normalization() def copied_config(self) -> SessionConfig: - """Return a copy of the active :py:class:`SessionConfig`. + """Return a copy of the active `SessionConfig`. Mutating the returned config does not affect this context; use the result when you need a starting point for a new context or @@ -1527,7 +1546,7 @@ def parse_capacity_limit(config_name: str, limit: str) -> int: ``"0"`` is accepted and returns 0. ``config_name`` is used purely for error messages and identifies which configuration setting the limit belongs to. Use this helper when constructing a - :py:class:`RuntimeEnvBuilder` from a human-friendly size string. + `RuntimeEnvBuilder` from a human-friendly size string. Examples: >>> SessionContext.parse_capacity_limit( @@ -1563,7 +1582,7 @@ def parse_sql_expr(self, sql: str, schema: DFSchema) -> Expr: return Expr(self.ctx.parse_sql_expr(sql, schema)) def execute_logical_plan(self, plan: LogicalPlan) -> DataFrame: - """Execute a :py:class:`~datafusion.plan.LogicalPlan` and return a DataFrame. + """Execute a `LogicalPlan` and return a DataFrame. Args: plan: Logical plan to execute. @@ -1618,13 +1637,13 @@ def add_physical_optimizer_rule( The rule is imported via its ``__datafusion_physical_optimizer_rule__`` PyCapsule, typically produced by a separate compiled extension. 
The - underlying :class:`SessionState` is rebuilt from its current state + underlying `SessionState` is rebuilt from its current state with the new rule appended, so previously registered tables, UDFs, and catalogs are preserved. Args: - rule: Object exposing ``__datafusion_physical_optimizer_rule__``, - a :class:`PhysicalOptimizerRuleExportable`. + rule: Object exposing ``__datafusion_physical_optimizer_rule__`` — a + :class:`~datafusion.context.PhysicalOptimizerRuleExportable`. Examples: >>> from datafusion import SessionContext @@ -1636,7 +1655,7 @@ def add_physical_optimizer_rule( self.ctx.add_physical_optimizer_rule(rule) def table_provider(self, name: str) -> Table: - """Return the :py:class:`~datafusion.catalog.Table` for the given table name. + """Return the :class:`~datafusion.catalog.Table` for the given table name. Args: name: Name of the table. @@ -1782,7 +1801,7 @@ def read_parquet( schema: pa.Schema | None = None, file_sort_order: Sequence[Sequence[SortKey]] | None = None, ) -> DataFrame: - """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. + """Read a Parquet source into a :class:`~datafusion.dataframe.DataFrame`. Args: path: Path to the Parquet file. @@ -1827,7 +1846,7 @@ def read_avro( file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, file_extension: str = ".avro", ) -> DataFrame: - """Create a :py:class:`DataFrame` for reading Avro data source. + """Create a `DataFrame` for reading Avro data source. Args: path: Path to the Avro file. @@ -1852,7 +1871,7 @@ def read_arrow( file_extension: str = ".arrow", file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, ) -> DataFrame: - """Create a :py:class:`DataFrame` for reading an Arrow IPC data source. + """Create a `DataFrame` for reading an Arrow IPC data source. Args: path: Path to the Arrow IPC file. 
@@ -1918,17 +1937,18 @@ def read_arrow( ) def read_empty(self) -> DataFrame: - """Create an empty :py:class:`DataFrame` with no columns or rows. + """Create an empty `DataFrame` with no columns or rows. See Also: - This is an alias for :meth:`empty_table`. + This is an alias for + :meth:`~datafusion.context.SessionContext.empty_table`. """ return self.empty_table() def read_table( self, table: Table | TableProviderExportable | DataFrame | pa.dataset.Dataset ) -> DataFrame: - """Creates a :py:class:`~datafusion.dataframe.DataFrame` from a table.""" + """Creates a :class:`~datafusion.dataframe.DataFrame` from a table.""" return DataFrame(self.ctx.read_table(table)) def execute(self, plan: ExecutionPlan, partitions: int) -> RecordBatchStream: @@ -1943,7 +1963,7 @@ def _convert_file_sort_order( Each ``SortKey`` can be a column name string, an ``Expr``, or a ``SortExpr`` and will be converted using - :func:`datafusion.expr.sort_list_to_raw_sort_list`. + :func:`~datafusion.expr.sort_list_to_raw_sort_list`. """ # Convert each ``SortKey`` in the provided sort order to the low-level # representation expected by the Rust bindings. @@ -2006,7 +2026,7 @@ def with_logical_extension_codec( Only FFI codecs are supported. Pass any object implementing ``__datafusion_logical_extension_codec__`` (see - :py:class:`~datafusion.user_defined.LogicalExtensionCodecExportable`). + `LogicalExtensionCodecExportable`). """ new_internal = self.ctx.with_logical_extension_codec(codec) new = SessionContext.__new__(SessionContext) @@ -2024,7 +2044,7 @@ def with_physical_extension_codec( Only FFI codecs are supported. Pass any object implementing ``__datafusion_physical_extension_codec__`` (see - :py:class:`~datafusion.user_defined.PhysicalExtensionCodecExportable`). + `PhysicalExtensionCodecExportable`). 
""" new_internal = self.ctx.with_physical_extension_codec(codec) new = SessionContext.__new__(SessionContext) @@ -2049,24 +2069,24 @@ def with_python_udf_inlining(self, *, enabled: bool) -> SessionContext: * **Cross-language portability.** The bytes can be decoded by a non-Python receiver, which must already have UDFs registered under matching names. - * **Safer deserialization.** :meth:`Expr.from_bytes` will refuse + * **Safer deserialization.** `from_bytes` will refuse to rebuild Python UDFs rather than call ``cloudpickle.loads`` on untrusted input. - The setting affects :meth:`Expr.to_bytes` and - :meth:`Expr.from_bytes` whenever this session is passed as the - ``ctx`` argument. :func:`pickle.dumps` and :func:`pickle.loads` + The setting affects :meth:`~datafusion.expr.Expr.to_bytes` and + `from_bytes` whenever this session is passed as the + ``ctx`` argument. :func:`~pickle.dumps` and :func:`~pickle.loads` do not pass a context, so to apply the setting through pickle, register this session with - :func:`datafusion.ipc.set_sender_ctx` on the sender and - :func:`datafusion.ipc.set_worker_ctx` on the receiver. + :func:`~datafusion.ipc.set_sender_ctx` on the sender and + :func:`~datafusion.ipc.set_worker_ctx` on the receiver. .. warning:: Security - This setting narrows only :meth:`Expr.from_bytes`. Calling - :func:`pickle.loads` on untrusted bytes remains unsafe + This setting narrows only `from_bytes`. Calling + :func:`~pickle.loads` on untrusted bytes remains unsafe regardless of the toggle. - Returns a new :class:`SessionContext` with the toggle applied; + Returns a new `SessionContext` with the toggle applied; the original session is unchanged. Examples: diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index de00ff474..460f07c7f 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -14,23 +14,23 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -""":py:class:`DataFrame` — lazy, chainable query representation. +""":class:`~datafusion.dataframe.DataFrame` — lazy, chainable query representation. -A :py:class:`DataFrame` is a logical plan over one or more data sources. -Methods that reshape the plan (:py:meth:`DataFrame.select`, -:py:meth:`DataFrame.filter`, :py:meth:`DataFrame.aggregate`, -:py:meth:`DataFrame.sort`, :py:meth:`DataFrame.join`, -:py:meth:`DataFrame.limit`, the set-operation methods, ...) return a new -:py:class:`DataFrame` and do no work until a terminal method such as -:py:meth:`DataFrame.collect`, :py:meth:`DataFrame.to_pydict`, -:py:meth:`DataFrame.show`, or one of the ``write_*`` methods is called. +A `DataFrame` is a logical plan over one or more data sources. +Methods that reshape the plan (:meth:`~datafusion.dataframe.DataFrame.select`, +`filter`, `aggregate`, +`sort`, :meth:`~datafusion.dataframe.DataFrame.join`, +`limit`, the set-operation methods, ...) return a new +`DataFrame` and do no work until a terminal method such as +`collect`, :meth:`~datafusion.dataframe.DataFrame.to_pydict`, +`show`, or one of the ``write_*`` methods is called. DataFrames are produced from a -:py:class:`~datafusion.context.SessionContext`, typically via -:py:meth:`~datafusion.context.SessionContext.sql`, -:py:meth:`~datafusion.context.SessionContext.read_csv`, -:py:meth:`~datafusion.context.SessionContext.read_parquet`, or -:py:meth:`~datafusion.context.SessionContext.from_pydict`. +:class:`~datafusion.context.SessionContext`, typically via +:meth:`~datafusion.context.SessionContext.sql`, +:meth:`~datafusion.context.SessionContext.read_csv`, +:meth:`~datafusion.context.SessionContext.read_parquet`, or +:meth:`~datafusion.context.SessionContext.from_pydict`. 
Examples: >>> ctx = dfn.SessionContext() @@ -38,7 +38,7 @@ >>> df.filter(col("a") > 1).select("b").to_pydict() {'b': [20, 30]} -See :ref:`user_guide_concepts` in the online documentation for a high-level +See the user guide concepts in the online documentation for a high-level overview of the execution model. """ @@ -88,11 +88,21 @@ from enum import Enum +__all__ = [ + "Compression", + "DataFrame", + "DataFrameWriteOptions", + "ExplainFormat", + "InsertOp", + "ParquetColumnOptions", + "ParquetWriterOptions", +] + class ExplainFormat(Enum): """Output format for explain plans. - Controls how the query plan is rendered in :py:meth:`DataFrame.explain`. + Controls how the query plan is rendered in `DataFrame.explain`. """ INDENT = "indent" @@ -348,16 +358,16 @@ class DataFrame: """Two dimensional table representation of data. DataFrame objects are iterable; iterating over a DataFrame yields - :class:`datafusion.RecordBatch` instances lazily. + :class:`~datafusion.RecordBatch` instances lazily. - See :ref:`user_guide_concepts` in the online documentation for more information. + See the user guide concepts in the online documentation for more information. """ def __init__(self, df: DataFrameInternal) -> None: """This constructor is not to be used by the end user. - See :py:class:`~datafusion.context.SessionContext` for methods to - create a :py:class:`DataFrame`. + See :class:`~datafusion.context.SessionContext` for methods to + create a :class:`~datafusion.dataframe.DataFrame`. """ self.df = df @@ -379,7 +389,7 @@ def into_view(self, temporary: bool = False) -> Table: return _Table(self.df.into_view(temporary)) def __getitem__(self, key: str | list[str]) -> DataFrame: - """Return a new :py:class:`DataFrame` with the specified column or columns. + """Return a new `DataFrame` with the specified column or columns. Args: key: Column name or list of column names to select.
@@ -428,7 +438,7 @@ def describe(self) -> DataFrame: return DataFrame(self.df.describe()) def schema(self) -> pa.Schema: - """Return the :py:class:`pyarrow.Schema` of this DataFrame. + """Return the :class:`~pyarrow.Schema` of this DataFrame. The output schema contains information on the name, data type, and nullability for each column. @@ -442,7 +452,7 @@ def column(self, name: str) -> Expr: """Return a fully qualified column expression for ``name``. Resolves an unqualified column name against this DataFrame's schema - and returns an :py:class:`Expr` whose underlying column reference + and returns an :class:`~datafusion.expr.Expr` whose underlying column reference includes the table qualifier. This is especially useful after joins, where the same column name may appear in multiple relations. @@ -477,17 +487,17 @@ def column(self, name: str) -> Expr: return self.find_qualified_columns(name)[0] def col(self, name: str) -> Expr: - """Alias for :py:meth:`column`. + """Alias for :meth:`~datafusion.dataframe.DataFrame.column`. See Also: - :py:meth:`column` + :meth:`~datafusion.dataframe.DataFrame.column` """ return self.column(name) def find_qualified_columns(self, *names: str) -> list[Expr]: """Return fully qualified column expressions for the given names. - This is a batch version of :py:meth:`column` — it resolves each + Batch form of :meth:`~datafusion.dataframe.DataFrame.column` — resolves each unqualified name against the DataFrame's schema and returns a list of qualified column expressions. @@ -524,7 +534,7 @@ def select_exprs(self, *args: str) -> DataFrame: return self.df.select_exprs(*args) def alias(self, alias: str) -> DataFrame: - """Assign a table alias to this :py:class:`DataFrame`. + """Assign a table alias to this :class:`~datafusion.dataframe.DataFrame`. Replaces the qualifiers of the output columns with ``alias``.
Useful for self-joins and any situation that needs an unambiguous table-style @@ -550,13 +560,13 @@ def alias(self, alias: str) -> DataFrame: return DataFrame(self.df.alias(alias)) def select(self, *exprs: Expr | str) -> DataFrame: - """Project arbitrary expressions into a new :py:class:`DataFrame`. + """Project arbitrary expressions into a new `DataFrame`. - String arguments are treated as column names; :py:class:`~datafusion.expr.Expr` + String arguments are treated as column names; :class:`~datafusion.expr.Expr` arguments can reshape, rename, or compute new columns. Args: - exprs: Either column names or :py:class:`~datafusion.expr.Expr` to select. + exprs: Either column names or :class:`~datafusion.expr.Expr` to select. Returns: DataFrame after projection. It has one column for each expression. @@ -645,9 +655,9 @@ def filter(self, *predicates: Expr | str) -> DataFrame: out. If more than one predicate is provided, these predicates will be combined as a logical AND. Each ``predicate`` can be an :class:`~datafusion.expr.Expr` created using helper functions such as - :func:`datafusion.col` or :func:`datafusion.lit`, or a SQL expression string + `col` or :func:`~datafusion.lit`, or a SQL expression string that will be parsed against the DataFrame schema. If more complex logic is - required, see the logical operations in :py:mod:`~datafusion.functions`. + required, see the logical operations in :mod:`~datafusion.functions`. Examples: >>> ctx = dfn.SessionContext() @@ -697,7 +707,7 @@ def with_column(self, name: str, expr: Expr | str) -> DataFrame: """Add an additional column to the DataFrame. The ``expr`` must be an :class:`~datafusion.expr.Expr` constructed with - :func:`datafusion.col` or :func:`datafusion.lit`, or a SQL expression + :func:`~datafusion.col.col` or :func:`~datafusion.lit`, or a SQL expression string that will be parsed against the DataFrame schema. 
Examples: @@ -725,7 +735,7 @@ def with_columns( By passing expressions, iterables of expressions, string SQL expressions, or named expressions. All expressions must be :class:`~datafusion.expr.Expr` objects created via - :func:`datafusion.col` or :func:`datafusion.lit`, or SQL expression strings. + `col` or :func:`~datafusion.lit`, or SQL expression strings. To pass named expressions use the form ``name=Expr``. Example usage: The following will add 4 columns labeled ``a``, ``b``, ``c``, @@ -806,18 +816,18 @@ def aggregate( By default each unique combination of the ``group_by`` columns produces one row. To get multiple levels of subtotals in a single pass, pass a - :py:class:`~datafusion.expr.GroupingSet` expression + :class:`~datafusion.expr.GroupingSet` expression (created via - :py:meth:`~datafusion.expr.GroupingSet.rollup`, - :py:meth:`~datafusion.expr.GroupingSet.cube`, or - :py:meth:`~datafusion.expr.GroupingSet.grouping_sets`) + :meth:`~datafusion.expr.GroupingSet.rollup`, + :meth:`~datafusion.expr.GroupingSet.cube`, or + :meth:`~datafusion.expr.GroupingSet.grouping_sets`) as the ``group_by`` argument. See the - :ref:`aggregation` user guide for detailed examples. + aggregation user guide for detailed examples. Args: group_by: Sequence of expressions or column names to group by, or ``None`` for aggregation over the whole DataFrame. - A :py:class:`~datafusion.expr.GroupingSet` expression may + A :class:`~datafusion.expr.GroupingSet` expression may be included to produce multiple grouping levels (rollup, cube, or explicit grouping sets). aggs: Sequence of expressions to aggregate. @@ -867,7 +877,7 @@ def sort(self, *exprs: SortKey) -> DataFrame: Note that any expression can be turned into a sort expression by calling its ``sort`` method. For ascending-only sorts, the shorter - :py:meth:`sort_by` is usually more convenient. + :meth:`~datafusion.dataframe.DataFrame.sort_by` is usually more convenient. 
Args: exprs: Sort expressions or column names, applied in order. @@ -883,7 +893,7 @@ def sort(self, *exprs: SortKey) -> DataFrame: >>> df.sort("a").to_pydict() {'a': [1, 2, 3], 'b': [20, 30, 10]} - Sort descending using :py:meth:`Expr.sort`: + Sort descending using :meth:`~datafusion.expr.Expr.sort`: >>> df.sort(col("a").sort(ascending=False)).to_pydict() {'a': [3, 2, 1], 'b': [10, 30, 20]} @@ -904,10 +914,12 @@ def cast(self, mapping: dict[str, pa.DataType[Any]]) -> DataFrame: return self.with_columns(exprs) def limit(self, count: int, offset: int = 0) -> DataFrame: - """Return a new :py:class:`DataFrame` with a limited number of rows. + """Return a new `DataFrame` with a limited number of rows. Results are returned in unspecified order unless the DataFrame is - explicitly sorted first via :py:meth:`sort` or :py:meth:`sort_by`. + explicitly sorted first via + :meth:`~datafusion.dataframe.DataFrame.sort` or + :meth:`~datafusion.dataframe.DataFrame.sort_by`. Args: count: Number of rows to limit the DataFrame to. @@ -932,7 +944,7 @@ def limit(self, count: int, offset: int = 0) -> DataFrame: return DataFrame(self.df.limit(count, offset)) def head(self, n: int = 5) -> DataFrame: - """Return a new :py:class:`DataFrame` with a limited number of rows. + """Return a new `DataFrame` with a limited number of rows. Args: n: Number of rows to take from the head of the DataFrame. @@ -943,7 +955,7 @@ def head(self, n: int = 5) -> DataFrame: return DataFrame(self.df.limit(n, 0)) def tail(self, n: int = 5) -> DataFrame: - """Return a new :py:class:`DataFrame` with a limited number of rows. + """Return a new `DataFrame` with a limited number of rows. Be aware this could be potentially expensive since the row size needs to be determined of the dataframe. This is done by collecting it. 
@@ -957,19 +969,19 @@ def tail(self, n: int = 5) -> DataFrame: return DataFrame(self.df.limit(n, max(0, self.count() - n))) def collect(self) -> list[pa.RecordBatch]: - """Execute this :py:class:`DataFrame` and collect results into memory. + """Execute this `DataFrame` and collect results into memory. Prior to calling ``collect``, modifying a DataFrame simply updates a plan (no actual computation is performed). Calling ``collect`` triggers the computation. Returns: - List of :py:class:`pyarrow.RecordBatch` collected from the DataFrame. + List of :class:`~pyarrow.RecordBatch` collected from the DataFrame. """ return self.df.collect() def collect_column(self, column_name: str) -> pa.Array | pa.ChunkedArray: - """Executes this :py:class:`DataFrame` for a single column.""" + """Executes this `DataFrame` for a single column.""" return self.df.collect_column(column_name) def cache(self) -> DataFrame: @@ -983,11 +995,11 @@ def cache(self) -> DataFrame: def collect_partitioned(self) -> list[list[pa.RecordBatch]]: """Execute this DataFrame and collect all partitioned results. - This operation returns :py:class:`pyarrow.RecordBatch` maintaining the input + This operation returns `RecordBatch` maintaining the input partitioning. Returns: - List of list of :py:class:`RecordBatch` collected from the + List of list of `RecordBatch` collected from the DataFrame. """ return self.df.collect_partitioned() @@ -1001,7 +1013,7 @@ def show(self, num: int = 20) -> None: self.df.show(num) def distinct(self) -> DataFrame: - """Return a new :py:class:`DataFrame` with all duplicated rows removed. + """Return a new `DataFrame` with all duplicated rows removed. Returns: DataFrame after removing duplicates. @@ -1058,15 +1070,15 @@ def join( join_keys: tuple[list[str], list[str]] | None = None, coalesce_duplicate_keys: bool = True, ) -> DataFrame: - """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. + """Join this `DataFrame` with another `DataFrame`. 
``on`` has to be provided or both ``left_on`` and ``right_on`` in conjunction. When non-key columns share the same name in both DataFrames, use - :py:meth:`DataFrame.col` on each DataFrame **before** the join to + `DataFrame.col` on each DataFrame **before** the join to obtain fully qualified column references that can disambiguate them. - See :py:meth:`join_on` for an example. + See :meth:`~datafusion.dataframe.DataFrame.join_on` for an example. Args: right: Other DataFrame to join with. @@ -1156,13 +1168,13 @@ def join_on( *on_exprs: Expr, how: Literal["inner", "left", "right", "full", "semi", "anti"] = "inner", ) -> DataFrame: - """Join two :py:class:`DataFrame` using the specified expressions. + """Join two `DataFrame` using the specified expressions. Join predicates must be :class:`~datafusion.expr.Expr` objects, typically - built with :func:`datafusion.col`. On expressions are used to support + built with :func:`~datafusion.col.col`. On expressions are used to support in-equality predicates. Equality predicates are correctly optimized. - Use :py:meth:`DataFrame.col` on each DataFrame **before** the join to + Use `DataFrame.col` on each DataFrame **before** the join to obtain fully qualified column references. These qualified references can then be used in the join predicate and to disambiguate columns with the same name when selecting from the result. @@ -1178,7 +1190,7 @@ def join_on( ... ).sort(col("x")).to_pydict() {'a': [1, 2], 'x': ['a', 'b'], 'b': [1, 2], 'y': ['c', 'd']} - Use :py:meth:`col` to disambiguate shared column names: + Use `DataFrame.col` to disambiguate shared column names: >>> left = ctx.from_pydict({"id": [1, 2], "val": [10, 20]}) >>> right = ctx.from_pydict({"id": [1, 2], "val": [30, 40]}) @@ -1216,7 +1228,7 @@ def explain( verbose: If ``True``, more details will be included. analyze: If ``True``, the plan will run and metrics reported. format: Output format for the plan. Defaults to - :py:attr:`ExplainFormat.INDENT`.
+ :attr:`~datafusion.dataframe.ExplainFormat.INDENT`. Examples: Show the plan in tree format: @@ -1287,9 +1299,9 @@ def repartition_by_hash(self, *exprs: Expr | str, num: int) -> DataFrame: return DataFrame(self.df.repartition_by_hash(*exprs, num=num)) def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: - """Calculate the union of two :py:class:`DataFrame`. + """Calculate the union of two :class:`~datafusion.dataframe.DataFrame`. - The two :py:class:`DataFrame` must have exactly the same schema. + The two `DataFrame` must have exactly the same schema. Args: other: DataFrame to union with. @@ -1318,17 +1330,17 @@ def union(self, other: DataFrame, distinct: bool = False) -> DataFrame: "union_distinct() is deprecated. Use union(other, distinct=True) instead." ) def union_distinct(self, other: DataFrame) -> DataFrame: - """Calculate the distinct union of two :py:class:`DataFrame`. + """Calculate the distinct union of two `DataFrame`. See Also: - :py:meth:`union` + :meth:`~datafusion.dataframe.DataFrame.union` """ return self.union(other, distinct=True) def intersect(self, other: DataFrame, distinct: bool = False) -> DataFrame: - """Calculate the intersection of two :py:class:`DataFrame`. + """Calculate the intersection of two `DataFrame`. - The two :py:class:`DataFrame` must have exactly the same schema. + The two `DataFrame` must have exactly the same schema. Args: other: DataFrame to intersect with. @@ -1356,11 +1368,11 @@ def intersect(self, other: DataFrame, distinct: bool = False) -> DataFrame: return DataFrame(self.df.intersect(other.df, distinct)) def except_all(self, other: DataFrame, distinct: bool = False) -> DataFrame: - """Calculate the set difference of two :py:class:`DataFrame`. + """Calculate the set difference of two `DataFrame`. Returns rows that are in this DataFrame but not in ``other``. - The two :py:class:`DataFrame` must have exactly the same schema. + The two `DataFrame` must have exactly the same schema. 
Args: other: DataFrame to calculate exception with. @@ -1386,10 +1398,11 @@ def except_all(self, other: DataFrame, distinct: bool = False) -> DataFrame: return DataFrame(self.df.except_all(other.df, distinct)) def union_by_name(self, other: DataFrame, distinct: bool = False) -> DataFrame: - """Union two :py:class:`DataFrame` matching columns by name. + """Union two `DataFrame` matching columns by name. - Unlike :py:meth:`union` which matches columns by position, this method - matches columns by their names, allowing DataFrames with different + Unlike :meth:`~datafusion.dataframe.DataFrame.union` which matches + columns by position, this method matches columns by their names, + allowing DataFrames with different column orders to be combined. Args: @@ -1460,7 +1473,8 @@ def sort_by(self, *exprs: Expr | str) -> DataFrame: This is a convenience method that sorts the DataFrame by the given expressions in ascending order with nulls last. For more control over - sort direction and null ordering, use :py:meth:`sort` instead. + sort direction and null ordering, use + :meth:`~datafusion.dataframe.DataFrame.sort` instead. Args: exprs: Expressions or column names to sort by. @@ -1485,7 +1499,7 @@ def write_csv( with_header: bool = False, write_options: DataFrameWriteOptions | None = None, ) -> None: - """Execute the :py:class:`DataFrame` and write the results to a CSV file. + """Execute the `DataFrame` and write the results to a CSV file. Args: path: Path of the CSV file to write. @@ -1531,7 +1545,7 @@ def write_parquet( compression_level: int | None = None, write_options: DataFrameWriteOptions | None = None, ) -> None: - """Execute the :py:class:`DataFrame` and write the results to a Parquet file. + """Execute the `DataFrame` and write the results to a Parquet file. 
Available compression types are: @@ -1586,7 +1600,7 @@ def write_parquet_with_options( options: ParquetWriterOptions, write_options: DataFrameWriteOptions | None = None, ) -> None: - """Execute the :py:class:`DataFrame` and write the results to a Parquet file. + """Execute the `DataFrame` and write the results to a Parquet file. Allows advanced writer options to be set with `ParquetWriterOptions`. @@ -1645,7 +1659,7 @@ def write_json( path: str | pathlib.Path, write_options: DataFrameWriteOptions | None = None, ) -> None: - """Execute the :py:class:`DataFrame` and write the results to a JSON file. + """Execute the `DataFrame` and write the results to a JSON file. Args: path: Path of the JSON file to write. @@ -1659,7 +1673,7 @@ def write_json( def write_table( self, table_name: str, write_options: DataFrameWriteOptions | None = None ) -> None: - """Execute the :py:class:`DataFrame` and write the results to a table. + """Execute the `DataFrame` and write the results to a table. The table must be registered with the session to perform this operation. Not all table providers support writing operations. See the individual @@ -1671,7 +1685,7 @@ def write_table( self.df.write_table(table_name, raw_write_options) def to_arrow_table(self) -> pa.Table: - """Execute the :py:class:`DataFrame` and convert it into an Arrow Table. + """Execute the `DataFrame` and convert it into an Arrow Table. Returns: Arrow Table. @@ -1696,7 +1710,7 @@ def execute_stream_partitioned(self) -> list[RecordBatchStream]: return [RecordBatchStream(rbs) for rbs in streams] def to_pandas(self) -> pd.DataFrame: - """Execute the :py:class:`DataFrame` and convert it into a Pandas DataFrame. + """Execute the `DataFrame` and convert it into a Pandas DataFrame. Returns: Pandas DataFrame. @@ -1704,7 +1718,7 @@ def to_pandas(self) -> pd.DataFrame: return self.df.to_pandas() def to_pylist(self) -> list[dict[str, Any]]: - """Execute the :py:class:`DataFrame` and convert it into a list of dictionaries. 
+ """Execute the `DataFrame` and convert it into a list of dictionaries. Returns: List of dictionaries. @@ -1712,7 +1726,7 @@ def to_pylist(self) -> list[dict[str, Any]]: return self.df.to_pylist() def to_pydict(self) -> dict[str, list[Any]]: - """Execute the :py:class:`DataFrame` and convert it into a dictionary of lists. + """Execute the `DataFrame` and convert it into a dictionary of lists. Returns: Dictionary of lists. @@ -1720,7 +1734,7 @@ def to_pydict(self) -> dict[str, list[Any]]: return self.df.to_pydict() def to_polars(self) -> pl.DataFrame: - """Execute the :py:class:`DataFrame` and convert it into a Polars DataFrame. + """Execute the `DataFrame` and convert it into a Polars DataFrame. Returns: Polars DataFrame. @@ -1728,7 +1742,7 @@ def to_polars(self) -> pl.DataFrame: return self.df.to_polars() def count(self) -> int: - """Return the total number of rows in this :py:class:`DataFrame`. + """Return the total number of rows in this `DataFrame`. Note that this method will actually run a plan to calculate the count, which may be slow for large or complicated DataFrames. @@ -1790,7 +1804,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: supported through this interface. Args: - requested_schema: Either a :py:class:`pyarrow.Schema` or an Arrow C + requested_schema: Either a :class:`~pyarrow.Schema` or an Arrow C Schema capsule (``PyCapsule``) produced by ``schema._export_to_c_capsule()``. The DataFrame will attempt to align its output with the fields and order specified by this schema. 
diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index fd2da99f0..f74697f3e 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -32,6 +32,19 @@ from collections.abc import Callable +__all__ = [ + "CellFormatter", + "DataFrameHtmlFormatter", + "DefaultStyleProvider", + "FormatterManager", + "StyleProvider", + "configure_formatter", + "get_formatter", + "reset_formatter", + "set_formatter", +] + + def _validate_positive_int(value: Any, param_name: str) -> None: """Validate that a parameter is a positive integer. @@ -218,12 +231,12 @@ class DataFrameHtmlFormatter: max_rows: Maximum number of rows to display in repr output repr_rows: Deprecated alias for max_rows enable_cell_expansion: Whether to add expand/collapse buttons for long cell - values + values custom_css: Additional CSS to include in the HTML output show_truncation_message: Whether to display a message when data is truncated style_provider: Custom provider for cell and header styles use_shared_styles: Whether to load styles and scripts only once per notebook - session + session """ def __init__( @@ -343,8 +356,9 @@ def repr_rows(self) -> int: """Get the maximum number of rows (deprecated name). .. deprecated:: - Use :attr:`max_rows` instead. This property is provided for - backward compatibility. + Use + :attr:`~datafusion.dataframe_formatter.DataFrameHtmlFormatter.max_rows` + instead. This property is provided for backward compatibility. Returns: The maximum number of rows to display @@ -356,8 +370,9 @@ def repr_rows(self, value: int) -> None: """Set the maximum number of rows using deprecated name. .. deprecated:: - Use :attr:`max_rows` setter instead. This property is provided for - backward compatibility. + Use the + :attr:`~datafusion.dataframe_formatter.DataFrameHtmlFormatter.max_rows` + setter instead. This property is provided for backward compatibility. 
Args: value: The maximum number of rows @@ -747,7 +762,7 @@ def get_formatter() -> DataFrameHtmlFormatter: Returns: The global HTML formatter instance - Example: + Examples: >>> from datafusion.dataframe_formatter import get_formatter >>> formatter = get_formatter() >>> formatter.max_cell_length = 50 # Increase cell length @@ -761,7 +776,7 @@ def set_formatter(formatter: DataFrameHtmlFormatter) -> None: Args: formatter: The formatter instance to use globally - Example: + Examples: >>> from datafusion.dataframe_formatter import get_formatter, set_formatter >>> custom_formatter = DataFrameHtmlFormatter(max_cell_length=100) >>> set_formatter(custom_formatter) @@ -782,7 +797,7 @@ def configure_formatter(**kwargs: Any) -> None: Raises: ValueError: If any invalid parameters are provided - Example: + Examples: >>> from datafusion.dataframe_formatter import configure_formatter >>> configure_formatter( ... max_cell_length=50, @@ -826,7 +841,7 @@ def reset_formatter() -> None: This function creates a new formatter with default configuration and sets it as the global formatter for all DataFrames. - Example: + Examples: >>> from datafusion.dataframe_formatter import reset_formatter >>> reset_formatter() # Reset formatter to default settings """ diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py index 4fdbdc5d4..e9682ab83 100644 --- a/python/datafusion/expr.py +++ b/python/datafusion/expr.py @@ -15,21 +15,21 @@ # specific language governing permissions and limitations # under the License. -""":py:class:`Expr` — the logical expression type used to build DataFusion queries. +"""`Expr` — the logical expression type used to build DataFusion queries. 
-An :py:class:`Expr` represents a computation over columns or literals: a +An :class:`~datafusion.expr.Expr` represents a computation over columns or literals: a column reference (``col("a")``), a literal (``lit(5)``), an operator combination (``col("a") + lit(1)``), or the output of a function from -:py:mod:`datafusion.functions`. Expressions are passed to -:py:class:`~datafusion.dataframe.DataFrame` methods such as -:py:meth:`~datafusion.dataframe.DataFrame.select`, -:py:meth:`~datafusion.dataframe.DataFrame.filter`, -:py:meth:`~datafusion.dataframe.DataFrame.aggregate`, and -:py:meth:`~datafusion.dataframe.DataFrame.sort`. +:mod:`~datafusion.functions`. Expressions are passed to +:class:`~datafusion.dataframe.DataFrame` methods such as +:meth:`~datafusion.dataframe.DataFrame.select`, +:meth:`~datafusion.dataframe.DataFrame.filter`, +:meth:`~datafusion.dataframe.DataFrame.aggregate`, and +:meth:`~datafusion.dataframe.DataFrame.sort`. Convenience constructors are re-exported at the package level: -:py:func:`datafusion.col` / :py:func:`datafusion.column` for column references -and :py:func:`datafusion.lit` / :py:func:`datafusion.literal` for scalar +:func:`~datafusion.col.col` / :func:`~datafusion.col.column` for column references +and :func:`~datafusion.lit` / :func:`~datafusion.literal` for scalar literals. Examples: @@ -38,7 +38,7 @@ >>> df.select((col("a") * lit(10)).alias("ten_a")).to_pydict() {'ten_a': [10, 20, 30]} -See :ref:`expressions` in the online documentation for details on available +See expressions in the online documentation for details on available operators and helpers. """ @@ -261,12 +261,12 @@ def ensure_expr(value: Expr | Any) -> expr_internal.Expr: """Return the internal expression from ``Expr`` or raise ``TypeError``. 
- This helper rejects plain strings and other non-:class:`Expr` values so - higher level APIs consistently require explicit :func:`~datafusion.col` or + This helper rejects plain strings and other non-`Expr` values so + higher level APIs consistently require explicit :func:`~datafusion.col.col` or :func:`~datafusion.lit` expressions. See Also: - :func:`coerce_to_expr` — the opposite behavior: *wraps* non-``Expr`` + `coerce_to_expr` — the opposite behavior: *wraps* non-``Expr`` values as literals instead of rejecting them. Args: @@ -276,7 +276,7 @@ def ensure_expr(value: Expr | Any) -> expr_internal.Expr: The internal expression representation. Raises: - TypeError: If ``value`` is not an instance of :class:`Expr`. + TypeError: If ``value`` is not an instance of :class:`~datafusion.expr.Expr`. """ if not isinstance(value, Expr): raise TypeError(EXPR_TYPE_ERROR) @@ -295,7 +295,7 @@ def ensure_expr_list( A flat list of raw expressions. Raises: - TypeError: If any item is not an instance of :class:`Expr`. + TypeError: If any item is not an instance of :class:`~datafusion.expr.Expr`. """ def _iter( @@ -316,9 +316,9 @@ def _iter( def coerce_to_expr(value: Any) -> Expr: """Coerce a native Python value to an ``Expr`` literal, passing ``Expr`` through. - This is the complement of :func:`ensure_expr`: where ``ensure_expr`` + Complements :func:`~datafusion.expr.ensure_expr`: where ``ensure_expr`` *rejects* non-``Expr`` values, ``coerce_to_expr`` *wraps* them via - :meth:`Expr.literal` so that functions can accept native Python types + `Expr.literal` so that functions can accept native Python types (``int``, ``float``, ``str``, ``bool``, etc.) alongside ``Expr``. Args: @@ -335,7 +335,7 @@ def coerce_to_expr(value: Any) -> Expr: def coerce_to_expr_or_none(value: Any | None) -> Expr | None: """Coerce a value to ``Expr`` or pass ``None`` through unchanged. - Same as :func:`coerce_to_expr` but accepts ``None`` for optional parameters.
+ Same as `coerce_to_expr` but accepts ``None`` for optional parameters. Args: value: An ``Expr`` instance, a Python literal to wrap, or ``None``. @@ -358,7 +358,7 @@ def _to_raw_expr(value: Expr | str) -> expr_internal.Expr: The internal :class:`~datafusion._internal.expr.Expr` representation. Raises: - TypeError: If ``value`` is neither an :class:`Expr` nor ``str``. + TypeError: If ``value`` is neither an `Expr` nor ``str``. """ if isinstance(value, str): return Expr.column(value).expr @@ -411,7 +411,7 @@ class Expr: # noqa: PLW1641 """Expression object. Expressions are one of the core concepts in DataFusion. See - :ref:`Expressions` in the online documentation for more information. + Expressions in the online documentation for more information. """ def __init__(self, expr: expr_internal.RawExpr) -> None: @@ -443,19 +443,20 @@ def variant_name(self) -> str: def to_bytes(self, ctx: SessionContext | None = None) -> bytes: """Serialize this expression to bytes for shipping to another process. - Use this — or :func:`pickle.dumps` — to send an expression to a + Use this — or :func:`~pickle.dumps` — to send an expression to a worker process for distributed evaluation. When ``ctx`` is supplied, encoding routes through that session's - installed :class:`LogicalExtensionCodec` (so settings like - :meth:`SessionContext.with_python_udf_inlining` take effect). + installed logical extension codec (set via + :meth:`~datafusion.context.SessionContext.with_logical_extension_codec`), + so settings like `with_python_udf_inlining` take effect. When ``ctx`` is ``None``, the default codec is used (Python UDF inlining on, no user-installed extension codec). Built-in functions travel inside the returned bytes. 
Python UDFs (scalar, aggregate, window) also inline by default, so the worker does not need to pre-register them; when the encoding session has - :meth:`SessionContext.with_python_udf_inlining` set to ``False``, + `with_python_udf_inlining` set to ``False``, Python UDFs travel by name only and must be registered on the worker. UDFs imported via the FFI capsule protocol always travel by name only and must be registered on the worker. @@ -463,8 +464,8 @@ def to_bytes(self, ctx: SessionContext | None = None) -> bytes: .. warning:: Security Bytes returned here may embed a cloudpickled Python callable (when the expression carries a Python UDF). - Reconstructing them via :meth:`from_bytes` or - :func:`pickle.loads` executes arbitrary Python on the + Reconstructing them via :meth:`~datafusion.expr.Expr.from_bytes` or + :func:`~pickle.loads` executes arbitrary Python on the receiver. Only accept payloads from trusted sources. .. warning:: Portability @@ -472,7 +473,7 @@ def to_bytes(self, ctx: SessionContext | None = None) -> bytes: stable across Python minor versions**. A payload produced on Python 3.11 will fail to load on Python 3.12. The wire format stamps the sender's ``(major, minor)``; - :meth:`from_bytes` raises a :class:`ValueError` naming + `from_bytes` raises a :exc:`~ValueError` naming both versions on mismatch. cloudpickle captures the UDF callable **by value** — @@ -526,14 +527,14 @@ def double(x): def from_bytes(cls, buf: bytes, ctx: SessionContext | None = None) -> Expr: """Reconstruct an expression from serialized bytes. - Accepts output of :meth:`to_bytes` or :func:`pickle.dumps`. - ``ctx`` is the :class:`SessionContext` used to resolve any + Accepts output of `to_bytes` or :func:`~pickle.dumps`. + ``ctx`` is the `SessionContext` used to resolve any function references that travel by name (e.g. FFI UDFs, or Python UDFs sent with inlining disabled via - :meth:`SessionContext.with_python_udf_inlining`). When + `with_python_udf_inlining`). 
When ``ctx`` is ``None`` the worker context installed via - :func:`datafusion.ipc.set_worker_ctx` is consulted; if no worker - context is installed, the global :class:`SessionContext` is used + :func:`~datafusion.ipc.set_worker_ctx` is consulted; if no worker + context is installed, the global `SessionContext` is used (sufficient for built-ins and Python UDFs, plus any UDFs registered on the global context). @@ -547,9 +548,9 @@ def from_bytes(cls, buf: bytes, ctx: SessionContext | None = None) -> Expr: cloudpickle payloads are **not portable across Python minor versions**. The wire format stamps the sender's ``(major, minor)``; if it does not match the current - interpreter, this method raises :class:`ValueError` + interpreter, this method raises :exc:`~ValueError` naming both versions. Modules the UDF imports must also - be importable on the receiver — see :meth:`to_bytes` for + be importable on the receiver — see `to_bytes` for by-value vs. by-reference details. Examples: @@ -567,17 +568,17 @@ def __reduce__(self) -> tuple[Callable[[bytes], Expr], tuple[bytes]]: """Pickle protocol hook. Lets expressions be shipped to worker processes via - :func:`pickle.dumps` / :func:`pickle.loads`. Built-in functions + :func:`~pickle.dumps` / :func:`~pickle.loads`. Built-in functions and Python UDFs (scalar, aggregate, window) travel inside the pickle bytes; only FFI-capsule UDFs require pre-registration on - the worker. The worker's :class:`SessionContext` for resolving + the worker. The worker's `SessionContext` for resolving those references is looked up via - :func:`datafusion.ipc.set_worker_ctx`, falling back to the - global :class:`SessionContext` if none has been installed on + :func:`~datafusion.ipc.set_worker_ctx`, falling back to the + global `SessionContext` if none has been installed on the worker. .. 
warning:: Security - :func:`pickle.loads` on the returned tuple executes + :func:`~pickle.loads` on the returned tuple executes arbitrary Python on the receiver, including any cloudpickled UDF callable embedded in the payload. Only unpickle expressions from trusted sources. @@ -585,7 +586,7 @@ def __reduce__(self) -> tuple[Callable[[bytes], Expr], tuple[bytes]]: .. warning:: Portability Sender and receiver must run the same Python ``(major, minor)`` version; cloudpickle bytecode is not - portable across minor versions. See :meth:`to_bytes` for + portable across minor versions. See `to_bytes` for details on what travels by value vs. by reference. Examples: @@ -596,17 +597,17 @@ def __reduce__(self) -> tuple[Callable[[bytes], Expr], tuple[bytes]]: 'a * Int64(2)' The encoding side honors a driver-side sender context installed - via :func:`datafusion.ipc.set_sender_ctx` — that is how - :meth:`SessionContext.with_python_udf_inlining` propagates + via :func:`~datafusion.ipc.set_sender_ctx` — that is how + `with_python_udf_inlining` propagates through ``pickle.dumps``. The sender context is read by - ``__reduce__``, so :func:`copy.copy` and :func:`copy.deepcopy` + ``__reduce__``, so :func:`~copy.copy` and :func:`~copy.deepcopy` — which also go through ``__reduce__`` — pick it up too. """ return (Expr._reconstruct, (self.to_bytes(get_sender_ctx()),)) @classmethod def _reconstruct(cls, proto_bytes: bytes) -> Expr: - """Internal entry point used by :meth:`__reduce__` on unpickle. + """Internal unpickle entry point for :meth:`~datafusion.expr.Expr.__reduce__`. Examples: >>> from datafusion import Expr, col, lit @@ -691,11 +692,11 @@ def __getitem__(self, key: str | int) -> Expr: If ``key`` is a string, returns the subfield of the struct. If ``key`` is an integer, retrieves the element in the array. Note that the element index begins at ``0``, unlike - :py:func:`~datafusion.functions.array_element` which begins at ``1``. + :func:`~datafusion.functions.array_element` which begins at ``1``.
If ``key`` is a slice, returns an array that contains a slice of the original array. Similar to integer indexing, this follows Python convention where the index begins at ``0`` unlike - :py:func:`~datafusion.functions.array_slice` which begins at ``1``. + :func:`~datafusion.functions.array_slice` which begins at ``1``. """ if isinstance(key, int): return Expr( @@ -848,7 +849,7 @@ def alias(self, name: str, metadata: dict[str, str] | None = None) -> Expr: return Expr(self.expr.alias(name, metadata)) def sort(self, ascending: bool = True, nulls_first: bool = True) -> SortExpr: - """Creates a sort :py:class:`Expr` from an existing :py:class:`Expr`. + """Creates a sort `Expr` from an existing `Expr`. Args: ascending: If true, sort in ascending order. @@ -959,7 +960,7 @@ def column_name(self, plan: LogicalPlan) -> str: def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder: """Set the ordering for a window or aggregate function. - This function will create an :py:class:`ExprFuncBuilder` that can be used to + This function will create an `ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ @@ -968,7 +969,7 @@ def order_by(self, *exprs: Expr | SortExpr) -> ExprFuncBuilder: def filter(self, filter: Expr) -> ExprFuncBuilder: """Filter an aggregate function. - This function will create an :py:class:`ExprFuncBuilder` that can be used to + This function will create an `ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ @@ -977,7 +978,7 @@ def filter(self, filter: Expr) -> ExprFuncBuilder: def distinct(self) -> ExprFuncBuilder: """Only evaluate distinct values for an aggregate function. 
- This function will create an :py:class:`ExprFuncBuilder` that can be used to + This function will create an `ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ @@ -986,7 +987,7 @@ def distinct(self) -> ExprFuncBuilder: def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: """Set the treatment for ``null`` values for a window or aggregate function. - This function will create an :py:class:`ExprFuncBuilder` that can be used to + This function will create an `ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ @@ -995,7 +996,7 @@ def null_treatment(self, null_treatment: NullTreatment) -> ExprFuncBuilder: def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: """Set the partitioning for a window function. - This function will create an :py:class:`ExprFuncBuilder` that can be used to + This function will create an `ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ @@ -1004,7 +1005,7 @@ def partition_by(self, *partition_by: Expr) -> ExprFuncBuilder: def window_frame(self, window_frame: WindowFrame) -> ExprFuncBuilder: """Set the frame fora window function. - This function will create an :py:class:`ExprFuncBuilder` that can be used to + This function will create an `ExprFuncBuilder` that can be used to set parameters for either window or aggregate functions. If used on any other type of expression, an error will be generated when ``build()`` is called. """ @@ -1125,7 +1126,7 @@ def initcap(self) -> Expr: def list_distinct(self) -> Expr: """Returns distinct values from the array after removing duplicates. 
- This is an alias for :py:func:`array_distinct`. + This is an alias for :func:`~datafusion.functions.array_distinct`. """ from . import functions as F @@ -1308,7 +1309,7 @@ def atanh(self) -> Expr: def list_dims(self) -> Expr: """Returns an array of the array's dimensions. - This is an alias for :py:func:`array_dims`. + This is an alias for :func:`~datafusion.functions.array_dims`. """ from . import functions as F @@ -1347,7 +1348,7 @@ def ceil(self) -> Expr: def list_length(self) -> Expr: """Returns the length of the array. - This is an alias for :py:func:`array_length`. + This is an alias for :func:`~datafusion.functions.array_length`. """ from . import functions as F @@ -1404,7 +1405,7 @@ def char_length(self) -> Expr: def list_ndims(self) -> Expr: """Returns the number of dimensions of the array. - This is an alias for :py:func:`array_ndims`. + This is an alias for :func:`~datafusion.functions.array_ndims`. """ from . import functions as F @@ -1429,7 +1430,7 @@ def sinh(self) -> Expr: return F.sinh(self) def empty(self) -> Expr: - """This is an alias for :py:func:`array_empty`.""" + """This is an alias for :func:`~datafusion.functions.array_empty`.""" from . import functions as F return F.empty(self) @@ -1571,7 +1572,7 @@ def get_upper_bound(self) -> WindowFrameBound: class WindowFrameBound: """Defines a single window frame bound. - :py:class:`WindowFrame` typically requires a start and end bound. + `WindowFrame` typically requires a start and end bound. """ def __init__(self, frame_bound: expr_internal.WindowFrameBound) -> None: @@ -1620,7 +1621,7 @@ def __init__(self, case_builder: expr_internal.CaseBuilder) -> None: """Constructs a case builder. This is not typically called by the end user directly. See - :py:func:`datafusion.functions.case` instead. + :func:`~datafusion.functions.case` instead. """ self.case_builder = case_builder @@ -1671,12 +1672,12 @@ class GroupingSet: """Factory for creating grouping set expressions. 
Grouping sets control how - :py:meth:`~datafusion.dataframe.DataFrame.aggregate` groups rows. + :meth:`~datafusion.dataframe.DataFrame.aggregate` groups rows. Instead of a single ``GROUP BY``, they produce multiple grouping levels in one pass — subtotals, cross-tabulations, or arbitrary column subsets. - Use :py:func:`~datafusion.functions.grouping` in the aggregate list + Use :func:`~datafusion.functions.grouping` in the aggregate list to tell which columns are aggregated across in each result row. """ @@ -1707,8 +1708,9 @@ def rollup(*exprs: Expr | str) -> Expr: [30, 30, 60] See Also: - :py:meth:`cube`, :py:meth:`grouping_sets`, - :py:func:`~datafusion.functions.grouping` + :meth:`~datafusion.expr.GroupingSet.cube`, + :meth:`~datafusion.expr.GroupingSet.grouping_sets`, + :func:`~datafusion.functions.grouping` """ args = [_to_raw_expr(e) for e in exprs] return Expr(expr_internal.GroupingSet.rollup(*args)) @@ -1729,7 +1731,7 @@ def cube(*exprs: Expr | str) -> Expr: Examples: With a single column, ``cube`` behaves identically to - :py:meth:`rollup`: + :meth:`~datafusion.expr.GroupingSet.rollup`: >>> from datafusion.expr import GroupingSet >>> ctx = dfn.SessionContext() @@ -1743,8 +1745,9 @@ def cube(*exprs: Expr | str) -> Expr: [30, 30, 60] See Also: - :py:meth:`rollup`, :py:meth:`grouping_sets`, - :py:func:`~datafusion.functions.grouping` + :meth:`~datafusion.expr.GroupingSet.rollup`, + :meth:`~datafusion.expr.GroupingSet.grouping_sets`, + :func:`~datafusion.functions.grouping` """ args = [_to_raw_expr(e) for e in exprs] return Expr(expr_internal.GroupingSet.cube(*args)) @@ -1786,8 +1789,9 @@ def grouping_sets(*expr_lists: list[Expr | str]) -> Expr: [3, 3, 4, 2] See Also: - :py:meth:`rollup`, :py:meth:`cube`, - :py:func:`~datafusion.functions.grouping` + :meth:`~datafusion.expr.GroupingSet.rollup`, + :meth:`~datafusion.expr.GroupingSet.cube`, + :func:`~datafusion.functions.grouping` """ raw_lists = [[_to_raw_expr(e) for e in lst] for lst in expr_lists] return 
Expr(expr_internal.GroupingSet.grouping_sets(*raw_lists)) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index c8f07497d..a86f6d07c 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -14,15 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Scalar, aggregate, and window functions for :py:class:`~datafusion.expr.Expr`. +"""Scalar, aggregate, and window functions for :class:`~datafusion.expr.Expr`. -Each function returns an :py:class:`~datafusion.expr.Expr` that can be combined +Each function returns an :class:`~datafusion.expr.Expr` that can be combined with other expressions and passed to -:py:class:`~datafusion.dataframe.DataFrame` methods such as -:py:meth:`~datafusion.dataframe.DataFrame.select`, -:py:meth:`~datafusion.dataframe.DataFrame.filter`, -:py:meth:`~datafusion.dataframe.DataFrame.aggregate`, and -:py:meth:`~datafusion.dataframe.DataFrame.window`. The module is conventionally +:class:`~datafusion.dataframe.DataFrame` methods such as +:meth:`~datafusion.dataframe.DataFrame.select`, +:meth:`~datafusion.dataframe.DataFrame.filter`, +:meth:`~datafusion.dataframe.DataFrame.aggregate`, and +:meth:`~datafusion.dataframe.DataFrame.window`. The module is conventionally imported as ``F`` so calls read like ``F.sum(col("price"))``. Examples: @@ -32,7 +32,7 @@ >>> df.aggregate([], [F.sum(col("a")).alias("total")]).to_pydict() {'total': [10]} -See :ref:`aggregation` and :ref:`window_functions` in the online documentation +See aggregation and window_functions in the online documentation for categorized catalogs of aggregate and window functions. """ @@ -449,7 +449,7 @@ def array_join(expr: Expr, delimiter: Expr | str) -> Expr: """Converts each element to its text representation. See Also: - This is an alias for :py:func:`array_to_string`. 
+ This is an alias for :func:`~datafusion.functions.array_to_string`. """ return array_to_string(expr, delimiter) @@ -458,7 +458,7 @@ def list_to_string(expr: Expr, delimiter: Expr | str) -> Expr: """Converts each element to its text representation. See Also: - This is an alias for :py:func:`array_to_string`. + This is an alias for :func:`~datafusion.functions.array_to_string`. """ return array_to_string(expr, delimiter) @@ -467,7 +467,7 @@ def list_join(expr: Expr, delimiter: Expr | str) -> Expr: """Converts each element to its text representation. See Also: - This is an alias for :py:func:`array_to_string`. + This is an alias for :func:`~datafusion.functions.array_to_string`. """ return array_to_string(expr, delimiter) @@ -475,9 +475,9 @@ def list_join(expr: Expr, delimiter: Expr | str) -> Expr: def lambda_var(name: str) -> Expr: """Create an unresolved reference to a lambda parameter by ``name``. - Use this inside the body passed to :py:func:`lambda_` to refer to one of the + Use this inside the body passed to `lambda_` to refer to one of the lambda's parameters. The owning higher-order function (such as - :py:func:`array_transform`) binds the variable to a concrete element type + `array_transform`) binds the variable to a concrete element type during query planning. Examples: @@ -490,7 +490,7 @@ def lambda_var(name: str) -> Expr: [2, 4, 6] See Also: - :py:func:`lambda_`, :py:func:`array_transform`, :py:func:`array_any_match`. + `lambda_`, `array_transform`, `array_any_match`. """ return Expr(f.lambda_var(name)) @@ -500,13 +500,13 @@ def lambda_(params: list[str], body: Expr) -> Expr: This is the explicit form of building a lambda. Most callers can instead pass a Python callable directly to a higher-order function such as - :py:func:`array_transform`, which builds the lambda automatically. Reach for + `array_transform`, which builds the lambda automatically. Reach for ``lambda_`` when you want explicit control over the parameter names. 
Args: params: Ordered lambda parameter names. body: Body expression that references the parameters via - :py:func:`lambda_var`. + :func:`~datafusion.functions.lambda_var`. Examples: >>> ctx = dfn.SessionContext() @@ -518,7 +518,7 @@ def lambda_(params: list[str], body: Expr) -> Expr: [2, 4, 6] See Also: - :py:func:`lambda_var`, :py:func:`array_transform`, :py:func:`array_any_match`. + `lambda_var`, `array_transform`, `array_any_match`. """ return Expr(f.lambda_(params, body.expr)) @@ -526,9 +526,9 @@ def lambda_(params: list[str], body: Expr) -> Expr: def _to_lambda(fn: Expr | Callable[..., Any]) -> Expr: """Coerce ``fn`` to a lambda ``Expr``. - Accepts either an ``Expr`` produced by :py:func:`lambda_` (returned + Accepts either an ``Expr`` produced by `lambda_` (returned unchanged) or a Python callable. A callable is introspected for its - parameter names; those names become :py:func:`lambda_var` references passed + parameter names; those names become `lambda_var` references passed positionally into the callable, and its return value (coerced to an ``Expr``) becomes the lambda body. """ @@ -550,7 +550,7 @@ def array_transform(array: Expr, transform: Expr | Callable[..., Any]) -> Expr: ``transform`` may be a Python callable, which is converted to a lambda automatically (its parameter names become the lambda parameters), or an - explicit lambda built with :py:func:`lambda_`. + explicit lambda built with :func:`~datafusion.functions.lambda_`. Examples: Using a Python callable: @@ -562,7 +562,7 @@ def array_transform(array: Expr, transform: Expr | Callable[..., Any]) -> Expr: ... 
).collect_column("d")[0].as_py() [2, 4, 6] - Using an explicit lambda built with :py:func:`lambda_`: + Using an explicit lambda built with :func:`~datafusion.functions.lambda_`: >>> double_fn = F.lambda_(["v"], F.lambda_var("v") * lit(2)) >>> df.select( @@ -571,7 +571,7 @@ def array_transform(array: Expr, transform: Expr | Callable[..., Any]) -> Expr: [2, 4, 6] See Also: - :py:func:`array_any_match`, :py:func:`lambda_`. + `array_any_match`, :func:`~datafusion.functions.lambda_`. """ return Expr(f.array_transform(array.expr, _to_lambda(transform).expr)) @@ -580,7 +580,7 @@ def list_transform(array: Expr, transform: Expr | Callable[..., Any]) -> Expr: """Transform each element of a list with a lambda. See Also: - This is an alias for :py:func:`array_transform`. + This is an alias for :func:`~datafusion.functions.array_transform`. """ return array_transform(array, transform) @@ -589,7 +589,7 @@ def array_any_match(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: """Return ``True`` if any element of ``array`` satisfies ``predicate``. ``predicate`` may be a Python callable, converted to a lambda - automatically, or an explicit lambda built with :py:func:`lambda_`. It must + automatically, or an explicit lambda built with `lambda_`. It must return a boolean expression. Examples: @@ -602,7 +602,7 @@ def array_any_match(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: ... ).collect_column("m")[0].as_py() True - Using an explicit lambda built with :py:func:`lambda_`: + Using an explicit lambda built with :func:`~datafusion.functions.lambda_`: >>> predicate = F.lambda_(["v"], F.lambda_var("v") > lit(2)) >>> df.select( @@ -611,7 +611,7 @@ def array_any_match(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: True See Also: - :py:func:`array_transform`, :py:func:`lambda_`. + `array_transform`, :func:`~datafusion.functions.lambda_`. 
""" return Expr(f.array_any_match(array.expr, _to_lambda(predicate).expr)) @@ -620,7 +620,7 @@ def any_match(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: """Return ``True`` if any element of an array satisfies a predicate. See Also: - This is an alias for :py:func:`array_any_match`. + This is an alias for :func:`~datafusion.functions.array_any_match`. """ return array_any_match(array, predicate) @@ -629,7 +629,7 @@ def list_any_match(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: """Return ``True`` if any element of a list satisfies a predicate. See Also: - This is an alias for :py:func:`array_any_match`. + This is an alias for :func:`~datafusion.functions.array_any_match`. """ return array_any_match(array, predicate) @@ -638,7 +638,7 @@ def array_filter(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: """Keep the elements of ``array`` for which ``predicate`` is ``True``. ``predicate`` may be a Python callable, converted to a lambda - automatically, or an explicit lambda built with :py:func:`lambda_`. It must + automatically, or an explicit lambda built with `lambda_`. It must return a boolean expression. The result is a new array containing only the matching elements. @@ -652,7 +652,7 @@ def array_filter(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: ... ).collect_column("f")[0].as_py() [3, 4, 5] - Using an explicit lambda built with :py:func:`lambda_`: + Using an explicit lambda built with :func:`~datafusion.functions.lambda_`: >>> predicate = F.lambda_(["v"], F.lambda_var("v") > lit(2)) >>> df.select( @@ -661,7 +661,7 @@ def array_filter(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: [3, 4, 5] See Also: - :py:func:`array_transform`, :py:func:`array_any_match`, :py:func:`lambda_`. + `array_transform`, `array_any_match`, :func:`~datafusion.functions.lambda_`. 
""" return Expr(f.array_filter(array.expr, _to_lambda(predicate).expr)) @@ -670,7 +670,7 @@ def list_filter(array: Expr, predicate: Expr | Callable[..., Any]) -> Expr: """Keep the elements of a list for which a predicate is ``True``. See Also: - This is an alias for :py:func:`array_filter`. + This is an alias for :func:`~datafusion.functions.array_filter`. """ return array_filter(array, predicate) @@ -864,8 +864,8 @@ def count_star(filter: Expr | None = None) -> Expr: def case(expr: Expr) -> CaseBuilder: """Create a case expression. - Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the - expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + Create a :class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :class:`~datafusion.expr.CaseBuilder` for detailed usage. Examples: @@ -883,8 +883,8 @@ def case(expr: Expr) -> CaseBuilder: def when(when: Expr, then: Expr) -> CaseBuilder: """Create a case expression that has no base expression. - Create a :py:class:`~datafusion.expr.CaseBuilder` to match cases for the - expression ``expr``. See :py:class:`~datafusion.expr.CaseBuilder` for + Create a :class:`~datafusion.expr.CaseBuilder` to match cases for the + expression ``expr``. See :class:`~datafusion.expr.CaseBuilder` for detailed usage. Examples: @@ -1315,7 +1315,7 @@ def ifnull(x: Expr, y: Expr) -> Expr: y: Fallback expression to return when ``x`` is NULL. See Also: - This is an alias for :py:func:`nvl`. + This is an alias for :func:`~datafusion.functions.nvl`. """ return nvl(x, y) @@ -1340,7 +1340,7 @@ def instr(string: Expr, substring: Expr | str) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. See Also: - This is an alias for :py:func:`strpos`. + This is an alias for :func:`~datafusion.functions.strpos`. 
""" return strpos(string, substring) @@ -1669,7 +1669,7 @@ def position(string: Expr, substring: Expr | str) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. See Also: - This is an alias for :py:func:`strpos`. + This is an alias for :func:`~datafusion.functions.strpos`. """ return strpos(string, substring) @@ -1694,7 +1694,7 @@ def pow(base: Expr, exponent: Expr | int | float) -> Expr: # noqa: PYI041 """Returns ``base`` raised to the power of ``exponent``. See Also: - This is an alias of :py:func:`power`. + This is an alias of :func:`~datafusion.functions.power`. """ return power(base, exponent) @@ -2329,7 +2329,7 @@ def current_timestamp() -> Expr: """Returns the current timestamp in nanoseconds. See Also: - This is an alias for :py:func:`now`. + This is an alias for :func:`~datafusion.functions.now`. """ return now() @@ -2361,7 +2361,7 @@ def date_format(arg: Expr, formatter: Expr | str) -> Expr: """Returns a string representation of a date, time, timestamp or duration. See Also: - This is an alias for :py:func:`to_char`. + This is an alias for :func:`~datafusion.functions.to_char`. """ return to_char(arg, formatter) @@ -2446,7 +2446,7 @@ def to_timestamp(arg: Expr, *formatters: Expr) -> Expr: def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in milliseconds. - See :py:func:`to_timestamp` for a description on how to use formatters. + See `to_timestamp` for a description on how to use formatters. Examples: >>> ctx = dfn.SessionContext() @@ -2465,7 +2465,7 @@ def to_timestamp_millis(arg: Expr, *formatters: Expr) -> Expr: def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in microseconds. - See :py:func:`to_timestamp` for a description on how to use formatters. + See `to_timestamp` for a description on how to use formatters. 
Examples: >>> ctx = dfn.SessionContext() @@ -2484,7 +2484,7 @@ def to_timestamp_micros(arg: Expr, *formatters: Expr) -> Expr: def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in nanoseconds. - See :py:func:`to_timestamp` for a description on how to use formatters. + See `to_timestamp` for a description on how to use formatters. Examples: >>> ctx = dfn.SessionContext() @@ -2503,7 +2503,7 @@ def to_timestamp_nanos(arg: Expr, *formatters: Expr) -> Expr: def to_timestamp_seconds(arg: Expr, *formatters: Expr) -> Expr: """Converts a string and optional formats to a ``Timestamp`` in seconds. - See :py:func:`to_timestamp` for a description on how to use formatters. + See `to_timestamp` for a description on how to use formatters. Examples: >>> ctx = dfn.SessionContext() @@ -2573,7 +2573,7 @@ def datepart(part: Expr | str, date: Expr) -> Expr: """Return a specified part of a date. See Also: - This is an alias for :py:func:`date_part`. + This is an alias for :func:`~datafusion.functions.date_part`. """ return date_part(part, date) @@ -2603,7 +2603,7 @@ def extract(part: Expr | str, date: Expr) -> Expr: """Extracts a subfield from the date. See Also: - This is an alias for :py:func:`date_part`. + This is an alias for :func:`~datafusion.functions.date_part`. """ return date_part(part, date) @@ -2634,7 +2634,7 @@ def datetrunc(part: Expr | str, date: Expr) -> Expr: """Truncates the date to a specified level of precision. See Also: - This is an alias for :py:func:`date_trunc`. + This is an alias for :func:`~datafusion.functions.date_trunc`. """ return date_trunc(part, date) @@ -2776,7 +2776,7 @@ def make_list(*args: Expr) -> Expr: """Returns an array using the specified input expressions. See Also: - This is an alias for :py:func:`make_array`. + This is an alias for :func:`~datafusion.functions.make_array`. 
""" return make_array(*args) @@ -2785,7 +2785,7 @@ def array(*args: Expr) -> Expr: """Returns an array using the specified input expressions. See Also: - This is an alias for :py:func:`make_array`. + This is an alias for :func:`~datafusion.functions.make_array`. """ return make_array(*args) @@ -2899,8 +2899,7 @@ def arrow_cast(expr: Expr, data_type: Expr | str | pa.DataType) -> Expr: """Casts an expression to a specified data type. The ``data_type`` can be a string, a ``pyarrow.DataType``, or an - ``Expr``. For simple types, :py:meth:`Expr.cast() - ` is more concise + ``Expr``. For simple types, `Expr.cast()` is more concise (e.g., ``col("a").cast(pa.float64())``). Use ``arrow_cast`` when you want to specify the target type as a string using DataFusion's type syntax, which can be more readable for complex types like @@ -2970,13 +2969,13 @@ def get_field(expr: Expr, *names: Expr | str) -> Expr: of nested struct/map fields in a single ``get_field`` call. For a single static-string name, ``expr["field"]`` is a convenient shorthand; use ``get_field`` when the field name is a dynamic - :py:class:`~datafusion.expr.Expr` or when traversing multiple levels at + :class:`~datafusion.expr.Expr` or when traversing multiple levels at once. Args: expr: The struct or map expression to read from. *names: One or more field names (``str``) or expressions - (:py:class:`~datafusion.expr.Expr`). + (:class:`~datafusion.expr.Expr`). Examples: Single-level lookup: @@ -3086,7 +3085,7 @@ def row(*args: Expr) -> Expr: """Returns a struct with the given arguments. See Also: - This is an alias for :py:func:`struct`. + This is an alias for :func:`~datafusion.functions.struct`. """ return struct(*args) @@ -3125,7 +3124,7 @@ def array_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. See Also: - This is an alias for :py:func:`array_append`. + This is an alias for :func:`~datafusion.functions.array_append`. 
""" return array_append(array, element) @@ -3134,7 +3133,7 @@ def list_append(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. See Also: - This is an alias for :py:func:`array_append`. + This is an alias for :func:`~datafusion.functions.array_append`. """ return array_append(array, element) @@ -3143,7 +3142,7 @@ def list_push_back(array: Expr, element: Expr) -> Expr: """Appends an element to the end of an array. See Also: - This is an alias for :py:func:`array_append`. + This is an alias for :func:`~datafusion.functions.array_append`. """ return array_append(array, element) @@ -3167,7 +3166,7 @@ def array_cat(*args: Expr) -> Expr: """Concatenates the input arrays. See Also: - This is an alias for :py:func:`array_concat`. + This is an alias for :func:`~datafusion.functions.array_concat`. """ return array_concat(*args) @@ -3208,7 +3207,7 @@ def list_cat(*args: Expr) -> Expr: """Concatenates the input arrays. See Also: - This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + This is an alias for `array_concat`, `array_cat`. """ return array_concat(*args) @@ -3217,7 +3216,7 @@ def list_concat(*args: Expr) -> Expr: """Concatenates the input arrays. See Also: - This is an alias for :py:func:`array_concat`, :py:func:`array_cat`. + This is an alias for `array_concat`, `array_cat`. """ return array_concat(*args) @@ -3226,7 +3225,7 @@ def list_distinct(array: Expr) -> Expr: """Returns distinct values from the array after removing duplicates. See Also: - This is an alias for :py:func:`array_distinct`. + This is an alias for :func:`~datafusion.functions.array_distinct`. """ return array_distinct(array) @@ -3235,7 +3234,7 @@ def list_dims(array: Expr) -> Expr: """Returns an array of the array's dimensions. See Also: - This is an alias for :py:func:`array_dims`. + This is an alias for :func:`~datafusion.functions.array_dims`. 
""" return array_dims(array) @@ -3272,7 +3271,7 @@ def list_empty(array: Expr) -> Expr: """Returns a boolean indicating whether the array is empty. See Also: - This is an alias for :py:func:`array_empty`. + This is an alias for :func:`~datafusion.functions.array_empty`. """ return array_empty(array) @@ -3281,7 +3280,7 @@ def array_extract(array: Expr, n: Expr | int) -> Expr: """Extracts the element with the index n from the array. See Also: - This is an alias for :py:func:`array_element`. + This is an alias for :func:`~datafusion.functions.array_element`. """ return array_element(array, n) @@ -3290,7 +3289,7 @@ def list_element(array: Expr, n: Expr | int) -> Expr: """Extracts the element with the index n from the array. See Also: - This is an alias for :py:func:`array_element`. + This is an alias for :func:`~datafusion.functions.array_element`. """ return array_element(array, n) @@ -3299,7 +3298,7 @@ def list_extract(array: Expr, n: Expr | int) -> Expr: """Extracts the element with the index n from the array. See Also: - This is an alias for :py:func:`array_element`. + This is an alias for :func:`~datafusion.functions.array_element`. """ return array_element(array, n) @@ -3321,7 +3320,7 @@ def list_length(array: Expr) -> Expr: """Returns the length of the array. See Also: - This is an alias for :py:func:`array_length`. + This is an alias for :func:`~datafusion.functions.array_length`. """ return array_length(array) @@ -3378,7 +3377,7 @@ def array_contains(array: Expr, element: Expr) -> Expr: """Returns true if the element appears in the array, otherwise false. See Also: - This is an alias for :py:func:`array_has`. + This is an alias for :func:`~datafusion.functions.array_has`. """ return array_has(array, element) @@ -3387,7 +3386,7 @@ def list_has(array: Expr, element: Expr) -> Expr: """Returns true if the element appears in the array, otherwise false. See Also: - This is an alias for :py:func:`array_has`. 
+ This is an alias for :func:`~datafusion.functions.array_has`. """ return array_has(array, element) @@ -3396,7 +3395,7 @@ def list_has_all(first_array: Expr, second_array: Expr) -> Expr: """Determines if there is complete overlap ``second_array`` in ``first_array``. See Also: - This is an alias for :py:func:`array_has_all`. + This is an alias for :func:`~datafusion.functions.array_has_all`. """ return array_has_all(first_array, second_array) @@ -3405,7 +3404,7 @@ def list_has_any(first_array: Expr, second_array: Expr) -> Expr: """Determine if there is an overlap between ``first_array`` and ``second_array``. See Also: - This is an alias for :py:func:`array_has_any`. + This is an alias for :func:`~datafusion.functions.array_has_any`. """ return array_has_any(first_array, second_array) @@ -3414,7 +3413,7 @@ def arrays_overlap(first_array: Expr, second_array: Expr) -> Expr: """Returns true if any element appears in both arrays. See Also: - This is an alias for :py:func:`array_has_any`. + This is an alias for :func:`~datafusion.functions.array_has_any`. """ return array_has_any(first_array, second_array) @@ -3423,7 +3422,7 @@ def list_overlap(first_array: Expr, second_array: Expr) -> Expr: """Returns true if any element appears in both arrays. See Also: - This is an alias for :py:func:`array_has_any`. + This is an alias for :func:`~datafusion.functions.array_has_any`. """ return array_has_any(first_array, second_array) @@ -3432,7 +3431,7 @@ def list_contains(array: Expr, element: Expr) -> Expr: """Returns true if the element appears in the array, otherwise false. See Also: - This is an alias for :py:func:`array_has`. + This is an alias for :func:`~datafusion.functions.array_has`. """ return array_has(array, element) @@ -3467,7 +3466,7 @@ def array_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. See Also: - This is an alias for :py:func:`array_position`. 
+ This is an alias for :func:`~datafusion.functions.array_position`. """ return array_position(array, element, index) @@ -3476,7 +3475,7 @@ def list_position(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. See Also: - This is an alias for :py:func:`array_position`. + This is an alias for :func:`~datafusion.functions.array_position`. """ return array_position(array, element, index) @@ -3485,7 +3484,7 @@ def list_indexof(array: Expr, element: Expr, index: int | None = 1) -> Expr: """Return the position of the first occurrence of ``element`` in ``array``. See Also: - This is an alias for :py:func:`array_position`. + This is an alias for :func:`~datafusion.functions.array_position`. """ return array_position(array, element, index) @@ -3508,7 +3507,7 @@ def list_positions(array: Expr, element: Expr) -> Expr: """Searches for an element in the array and returns all occurrences. See Also: - This is an alias for :py:func:`array_positions`. + This is an alias for :func:`~datafusion.functions.array_positions`. """ return array_positions(array, element) @@ -3530,7 +3529,7 @@ def list_ndims(array: Expr) -> Expr: """Returns the number of dimensions of the array. See Also: - This is an alias for :py:func:`array_ndims`. + This is an alias for :func:`~datafusion.functions.array_ndims`. """ return array_ndims(array) @@ -3553,7 +3552,7 @@ def array_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. See Also: - This is an alias for :py:func:`array_prepend`. + This is an alias for :func:`~datafusion.functions.array_prepend`. """ return array_prepend(element, array) @@ -3562,7 +3561,7 @@ def list_prepend(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. See Also: - This is an alias for :py:func:`array_prepend`. + This is an alias for :func:`~datafusion.functions.array_prepend`. 
""" return array_prepend(element, array) @@ -3571,7 +3570,7 @@ def list_push_front(element: Expr, array: Expr) -> Expr: """Prepends an element to the beginning of an array. See Also: - This is an alias for :py:func:`array_prepend`. + This is an alias for :func:`~datafusion.functions.array_prepend`. """ return array_prepend(element, array) @@ -3608,7 +3607,7 @@ def list_pop_back(array: Expr) -> Expr: """Returns the array without the last element. See Also: - This is an alias for :py:func:`array_pop_back`. + This is an alias for :func:`~datafusion.functions.array_pop_back`. """ return array_pop_back(array) @@ -3617,7 +3616,7 @@ def list_pop_front(array: Expr) -> Expr: """Returns the array without the first element. See Also: - This is an alias for :py:func:`array_pop_front`. + This is an alias for :func:`~datafusion.functions.array_pop_front`. """ return array_pop_front(array) @@ -3640,7 +3639,7 @@ def list_remove(array: Expr, element: Expr) -> Expr: """Removes the first element from the array equal to the given value. See Also: - This is an alias for :py:func:`array_remove`. + This is an alias for :func:`~datafusion.functions.array_remove`. """ return array_remove(array, element) @@ -3666,7 +3665,7 @@ def list_remove_n(array: Expr, element: Expr, max: Expr | int) -> Expr: """Removes the first ``max`` elements from the array equal to the given value. See Also: - This is an alias for :py:func:`array_remove_n`. + This is an alias for :func:`~datafusion.functions.array_remove_n`. """ return array_remove_n(array, element, max) @@ -3691,7 +3690,7 @@ def list_remove_all(array: Expr, element: Expr) -> Expr: """Removes all elements from the array equal to the given value. See Also: - This is an alias for :py:func:`array_remove_all`. + This is an alias for `array_remove_all`. """ return array_remove_all(array, element) @@ -3715,7 +3714,7 @@ def list_repeat(element: Expr, count: Expr | int) -> Expr: """Returns an array containing ``element`` ``count`` times. 
See Also: - This is an alias for :py:func:`array_repeat`. + This is an alias for :func:`~datafusion.functions.array_repeat`. """ return array_repeat(element, count) @@ -3739,7 +3738,7 @@ def list_replace(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces the first occurrence of ``from_val`` with ``to_val``. See Also: - This is an alias for :py:func:`array_replace`. + This is an alias for :func:`~datafusion.functions.array_replace`. """ return array_replace(array, from_val, to_val) @@ -3771,7 +3770,7 @@ def list_replace_n(array: Expr, from_val: Expr, to_val: Expr, max: Expr | int) - specified element. See Also: - This is an alias for :py:func:`array_replace_n`. + This is an alias for :func:`~datafusion.functions.array_replace_n`. """ return array_replace_n(array, from_val, to_val, max) @@ -3795,7 +3794,7 @@ def list_replace_all(array: Expr, from_val: Expr, to_val: Expr) -> Expr: """Replaces all occurrences of ``from_val`` with ``to_val``. See Also: - This is an alias for :py:func:`array_replace_all`. + This is an alias for `array_replace_all`. """ return array_replace_all(array, from_val, to_val) @@ -3841,7 +3840,7 @@ def list_sort(array: Expr, descending: bool = False, null_first: bool = False) - """Sorts the array. See Also: - This is an alias for :py:func:`array_sort`. + This is an alias for :func:`~datafusion.functions.array_sort`. """ return array_sort(array, descending=descending, null_first=null_first) @@ -3890,7 +3889,7 @@ def list_slice( """Returns a slice of the array. See Also: - This is an alias for :py:func:`array_slice`. + This is an alias for :func:`~datafusion.functions.array_slice`. """ return array_slice(array, begin, end, stride) @@ -3918,7 +3917,7 @@ def list_intersect(array1: Expr, array2: Expr) -> Expr: """Returns an the intersection of ``array1`` and ``array2``. See Also: - This is an alias for :py:func:`array_intersect`. + This is an alias for :func:`~datafusion.functions.array_intersect`. 
""" return array_intersect(array1, array2) @@ -3950,7 +3949,7 @@ def list_union(array1: Expr, array2: Expr) -> Expr: Duplicate rows will not be returned. See Also: - This is an alias for :py:func:`array_union`. + This is an alias for :func:`~datafusion.functions.array_union`. """ return array_union(array1, array2) @@ -3973,7 +3972,7 @@ def list_except(array1: Expr, array2: Expr) -> Expr: """Returns the elements that appear in ``array1`` but not in the ``array2``. See Also: - This is an alias for :py:func:`array_except`. + This is an alias for :func:`~datafusion.functions.array_except`. """ return array_except(array1, array2) @@ -4003,7 +4002,7 @@ def list_resize(array: Expr, size: Expr | int, value: Expr) -> Expr: filled with the given ``value``. See Also: - This is an alias for :py:func:`array_resize`. + This is an alias for :func:`~datafusion.functions.array_resize`. """ return array_resize(array, size, value) @@ -4026,7 +4025,7 @@ def list_any_value(array: Expr) -> Expr: """Returns the first non-null element in the array. See Also: - This is an alias for :py:func:`array_any_value`. + This is an alias for :func:`~datafusion.functions.array_any_value`. """ return array_any_value(array) @@ -4051,7 +4050,7 @@ def list_distance(array1: Expr, array2: Expr) -> Expr: """Returns the Euclidean distance between two numeric arrays. See Also: - This is an alias for :py:func:`array_distance`. + This is an alias for :func:`~datafusion.functions.array_distance`. """ return array_distance(array1, array2) @@ -4074,7 +4073,7 @@ def list_max(array: Expr) -> Expr: """Returns the maximum value in the array. See Also: - This is an alias for :py:func:`array_max`. + This is an alias for :func:`~datafusion.functions.array_max`. """ return array_max(array) @@ -4097,7 +4096,7 @@ def list_min(array: Expr) -> Expr: """Returns the minimum value in the array. See Also: - This is an alias for :py:func:`array_min`. + This is an alias for :func:`~datafusion.functions.array_min`. 
""" return array_min(array) @@ -4120,7 +4119,7 @@ def list_reverse(array: Expr) -> Expr: """Reverses the order of elements in the array. See Also: - This is an alias for :py:func:`array_reverse`. + This is an alias for :func:`~datafusion.functions.array_reverse`. """ return array_reverse(array) @@ -4144,7 +4143,7 @@ def list_zip(*arrays: Expr) -> Expr: """Combines multiple arrays into a single array of structs. See Also: - This is an alias for :py:func:`arrays_zip`. + This is an alias for :func:`~datafusion.functions.arrays_zip`. """ return arrays_zip(*arrays) @@ -4190,7 +4189,7 @@ def string_to_list( """Splits a string based on a delimiter and returns an array of parts. See Also: - This is an alias for :py:func:`string_to_array`. + This is an alias for :func:`~datafusion.functions.string_to_array`. """ return string_to_array(string, delimiter, null_string) @@ -4198,7 +4197,7 @@ def string_to_list( def gen_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr: """Creates a list of values in the range between start and stop. - Unlike :py:func:`range`, this includes the upper bound. + Unlike :func:`~datafusion.functions.range`, this includes the upper bound. Examples: >>> ctx = dfn.SessionContext() @@ -4226,10 +4225,10 @@ def gen_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr: def generate_series(start: Expr, stop: Expr, step: Expr | None = None) -> Expr: """Creates a list of values in the range between start and stop. - Unlike :py:func:`range`, this includes the upper bound. + Unlike :func:`~datafusion.functions.range`, this includes the upper bound. See Also: - This is an alias for :py:func:`gen_series`. + This is an alias for :func:`~datafusion.functions.gen_series`. """ return gen_series(start, stop, step) @@ -4264,7 +4263,7 @@ def empty(array: Expr) -> Expr: """Returns true if the array is empty. See Also: - This is an alias for :py:func:`array_empty`. + This is an alias for :func:`~datafusion.functions.array_empty`. 
""" return array_empty(array) @@ -4283,7 +4282,7 @@ def make_map(*args: Any) -> Expr: - ``make_map(k1, v1, k2, v2, ...)`` — from alternating keys and their associated values. - Keys and values that are not already :py:class:`~datafusion.expr.Expr` + Keys and values that are not already :class:`~datafusion.expr.Expr` are automatically converted to literal expressions. Examples: @@ -4416,7 +4415,7 @@ def element_at(map: Expr, key: Expr) -> Expr: Returns ``[None]`` if the key is absent. See Also: - This is an alias for :py:func:`map_extract`. + This is an alias for :func:`~datafusion.functions.map_extract`. """ return map_extract(map, key) @@ -4428,9 +4427,9 @@ def approx_distinct( ) -> Expr: """Returns the approximate number of distinct values. - This aggregate function is similar to :py:func:`count` with distinct set, but it + This aggregate function is similar to `count` with distinct set, but it will approximate the number of distinct entries. It may return significantly faster - than :py:func:`count` for some DataFrames. + than :func:`~datafusion.functions.count` for some DataFrames. If using the builder functions described in ref:`_aggregation` this function ignores the options ``order_by``, ``null_treatment``, and ``distinct``. @@ -4465,7 +4464,7 @@ def approx_distinct( def approx_median(expression: Expr, filter: Expr | None = None) -> Expr: """Returns the approximate median value. - This aggregate function is similar to :py:func:`median`, but it will only + This aggregate function is similar to `median`, but it will only approximate the median. It may return significantly faster for some DataFrames. If using the builder functions described in ref:`_aggregation` this function ignores @@ -4561,7 +4560,7 @@ def approx_percentile_cont_with_weight( ) -> Expr: """Returns the value of the weighted approximate percentile. 
- This aggregate function is similar to :py:func:`approx_percentile_cont` except that + This aggregate function is similar to `approx_percentile_cont` except that it uses the associated associated weights. If using the builder functions described in ref:`_aggregation` this function ignores @@ -4613,7 +4612,7 @@ def percentile_cont( ) -> Expr: """Computes the exact percentile of input values using continuous interpolation. - Unlike :py:func:`approx_percentile_cont`, this function computes the exact + Unlike `approx_percentile_cont`, this function computes the exact percentile value rather than an approximation. If using the builder functions described in ref:`_aggregation` this function ignores @@ -4655,7 +4654,7 @@ def quantile_cont( """Computes the exact percentile of input values using continuous interpolation. See Also: - This is an alias for :py:func:`percentile_cont`. + This is an alias for :func:`~datafusion.functions.percentile_cont`. """ return percentile_cont(sort_expression, percentile, filter) @@ -4669,7 +4668,7 @@ def array_agg( """Aggregate values into an array. Currently ``distinct`` and ``order_by`` cannot be used together. As a work around, - consider :py:func:`array_sort` after aggregation. + consider :func:`~datafusion.functions.array_sort` after aggregation. [Issue Tracker](https://github.com/apache/datafusion/issues/12371) If using the builder functions described in ref:`_aggregation` this function ignores @@ -4731,30 +4730,20 @@ def grouping( aggregate spans all values of that column). This function is meaningful with - :py:meth:`GroupingSet.rollup `, - :py:meth:`GroupingSet.cube `, or - :py:meth:`GroupingSet.grouping_sets `, + :meth:`~datafusion.expr.GroupingSet.rollup`, + :meth:`~datafusion.expr.GroupingSet.cube`, or + :meth:`~datafusion.expr.GroupingSet.grouping_sets`, where different rows are grouped by different subsets of columns. 
In a default aggregation without grouping sets every column is always part of the key, so ``grouping()`` always returns 0. - .. warning:: - - Due to an upstream DataFusion limitation - (`#21411 `_), - ``.alias()`` cannot be applied directly to a ``grouping()`` - expression. Doing so will raise an error at execution time. To - rename the column, use - :py:meth:`~datafusion.dataframe.DataFrame.with_column_renamed` - on the result DataFrame instead. - Args: expression: The column to check grouping status for distinct: If True, compute on distinct values only filter: If provided, only compute against rows for which the filter is True Examples: - With :py:meth:`~datafusion.expr.GroupingSet.rollup`, the result + With :meth:`~datafusion.expr.GroupingSet.rollup`, the result includes both per-group rows (``grouping(a) = 0``) and a grand-total row where ``a`` is aggregated across (``grouping(a) = 1``): @@ -4771,7 +4760,7 @@ def grouping( [30, 30, 60] See Also: - :py:class:`~datafusion.expr.GroupingSet` + :class:`~datafusion.expr.GroupingSet` """ filter_raw = filter.expr if filter is not None else None return Expr(f.grouping(expression.expr, distinct=distinct, filter=filter_raw)) @@ -4987,7 +4976,7 @@ def covar(value_y: Expr, value_x: Expr, filter: Expr | None = None) -> Expr: """Computes the sample covariance. See Also: - This is an alias for :py:func:`covar_samp`. + This is an alias for :func:`~datafusion.functions.covar_samp`. """ return covar_samp(value_y, value_x, filter) @@ -5028,7 +5017,7 @@ def mean(expression: Expr, filter: Expr | None = None) -> Expr: """Returns the average (mean) value of the argument. See Also: - This is an alias for :py:func:`avg`. + This is an alias for :func:`~datafusion.functions.avg`. """ return avg(expression, filter) @@ -5222,7 +5211,7 @@ def stddev_samp(arg: Expr, filter: Expr | None = None) -> Expr: """Computes the sample standard deviation of the argument. See Also: - This is an alias for :py:func:`stddev`. 
+ This is an alias for :func:`~datafusion.functions.stddev`. """ return stddev(arg, filter=filter) @@ -5231,7 +5220,7 @@ def var(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. See Also: - This is an alias for :py:func:`var_samp`. + This is an alias for :func:`~datafusion.functions.var_samp`. """ return var_samp(expression, filter) @@ -5272,7 +5261,7 @@ def var_population(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the population variance of the argument. See Also: - This is an alias for :py:func:`var_pop`. + This is an alias for :func:`~datafusion.functions.var_pop`. """ return var_pop(expression, filter) @@ -5313,7 +5302,7 @@ def var_sample(expression: Expr, filter: Expr | None = None) -> Expr: """Computes the sample variance of the argument. See Also: - This is an alias for :py:func:`var_samp`. + This is an alias for :func:`~datafusion.functions.var_samp`. """ return var_samp(expression, filter) @@ -6063,7 +6052,7 @@ def lead( return the 3rd following value in column ``b``. At the end of the partition, where no further values can be returned it will return the default value of 5. - Here is an example of both the ``lead`` and :py:func:`datafusion.functions.lag` + Here is an example of both the ``lead`` and :func:`~datafusion.functions.lag` functions on a simple DataFrame:: +--------+------+-----+ @@ -6139,7 +6128,7 @@ def lag( will return the 3rd previous value in column ``b``. At the beginning of the partition, where no values can be returned it will return the default value of 5. - Here is an example of both the ``lag`` and :py:func:`datafusion.functions.lead` + Here is an example of both the ``lag`` and :func:`~datafusion.functions.lead` functions on a simple DataFrame:: +--------+------+-----+ @@ -6322,7 +6311,7 @@ def dense_rank( ) -> Expr: """Create a dense_rank window function. 
- This window function is similar to :py:func:`rank` except that the returned values + This window function is similar to `rank` except that the returned values will be consecutive. Here is an example of a dataframe with a window ordered by descending ``points`` and the associated dense rank:: @@ -6378,7 +6367,7 @@ def percent_rank( ) -> Expr: """Create a percent_rank window function. - This window function is similar to :py:func:`rank` except that the returned values + This window function is similar to `rank` except that the returned values are the percentage from 0.0 to 1.0 from first to last. Here is an example of a dataframe with a window ordered by descending ``points`` and the associated percent rank:: @@ -6436,7 +6425,7 @@ def cume_dist( ) -> Expr: """Create a cumulative distribution window function. - This window function is similar to :py:func:`rank` except that the returned values + This window function is similar to `rank` except that the returned values are the ratio of the row number to the total number of rows. Here is an example of a dataframe with a window ordered by descending ``points`` and the associated cumulative distribution:: diff --git a/python/datafusion/input/base.py b/python/datafusion/input/base.py index f67dde2a1..52b170b12 100644 --- a/python/datafusion/input/base.py +++ b/python/datafusion/input/base.py @@ -25,6 +25,8 @@ from datafusion.common import SqlTable +__all__ = ["BaseInputSource"] + class BaseInputSource(ABC): """Base Input Source class. diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py index 779d94d23..3390d0587 100644 --- a/python/datafusion/input/location.py +++ b/python/datafusion/input/location.py @@ -23,6 +23,8 @@ from datafusion.common import DataTypeMap, SqlTable from datafusion.input.base import BaseInputSource +__all__ = ["LocationInputPlugin"] + class LocationInputPlugin(BaseInputSource): """Input Plugin for everything. 
diff --git a/python/datafusion/io.py b/python/datafusion/io.py index 4f9c3c516..9d72ae744 100644 --- a/python/datafusion/io.py +++ b/python/datafusion/io.py @@ -34,6 +34,14 @@ from .options import CsvReadOptions +__all__ = [ + "read_avro", + "read_csv", + "read_json", + "read_parquet", +] + + def read_parquet( path: str | pathlib.Path, table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, @@ -43,7 +51,7 @@ def read_parquet( schema: pa.Schema | None = None, file_sort_order: list[list[Expr]] | None = None, ) -> DataFrame: - """Read a Parquet source into a :py:class:`~datafusion.dataframe.Dataframe`. + """Read a Parquet source into a :class:`~datafusion.dataframe.DataFrame`. This function will use the global context. Any functions or tables registered with another context may not be accessible when used with a DataFrame created @@ -175,7 +183,7 @@ def read_avro( file_partition_cols: list[tuple[str, str | pa.DataType]] | None = None, file_extension: str = ".avro", ) -> DataFrame: - """Create a :py:class:`DataFrame` for reading Avro data source. + """Create a `DataFrame` for reading Avro data source. This function will use the global context. Any functions or tables registered with another context may not be accessible when used with a DataFrame created diff --git a/python/datafusion/ipc.py b/python/datafusion/ipc.py index 487abd4c3..71e2c2098 100644 --- a/python/datafusion/ipc.py +++ b/python/datafusion/ipc.py @@ -17,12 +17,12 @@ """Driver- and worker-side setup for distributing DataFusion expressions. -When a :class:`Expr` is shipped to a worker process (e.g. through -:func:`multiprocessing.Pool` or a Ray actor), the worker reconstructs the -expression against a :class:`SessionContext`. If the expression references +When a :class:`~datafusion.expr.Expr` is shipped to a worker process (e.g. through +`Pool` or a Ray actor), the worker reconstructs the +expression against a `SessionContext`. 
If the expression references UDFs imported via the FFI capsule protocol — or any UDF the worker would otherwise resolve from its registered functions rather than from inside -the shipped expression — install a configured :class:`SessionContext` +the shipped expression — install a configured `SessionContext` once per worker: .. code-block:: python @@ -42,21 +42,22 @@ def init_worker(): .. note:: Serialization model Expressions containing Python UDFs (scalar, aggregate, window) are - serialized using :mod:`cloudpickle`. The callable itself travels + serialized using `cloudpickle`. The callable itself travels **by value** (bytecode and closure cells inlined), but any names the callable resolves via ``import`` are captured **by reference** and must be importable on the receiving worker. The serialized payload is stamped with the sender's Python ``(major, minor)`` version. Loading on a different minor version - raises :class:`ValueError` with an actionable message — cloudpickle + raises :exc:`~ValueError` with an actionable message — cloudpickle payloads are not portable across Python minor versions. See - :meth:`datafusion.Expr.to_bytes` for examples of what travels by + :meth:`~datafusion.expr.Expr.to_bytes` for examples of what travels by value vs. by reference. -On the driver side, call :func:`set_sender_ctx` to control how -:func:`pickle.dumps` encodes expressions — for example, to apply -:meth:`SessionContext.with_python_udf_inlining` to every pickled +On the driver side, call +:func:`~datafusion.ipc.set_sender_ctx` to control how +:func:`~pickle.dumps` encodes expressions — for example, to apply +`with_python_udf_inlining` to every pickled expression on this thread: >>> import pickle @@ -77,12 +78,13 @@ def init_worker(): ``ctx``. The thread-local sender context holds a strong reference to the -installed :class:`SessionContext` until :func:`clear_sender_ctx` is -called or the thread exits. 
Long-running driver threads that install a sender -context once and never clear it will retain that session for the -lifetime of the thread; pair :func:`set_sender_ctx` with -:func:`clear_sender_ctx` (e.g. in a ``try``/``finally``) when the -sender context is only needed for a bounded scope. +installed `SessionContext` until +:func:`~datafusion.ipc.clear_sender_ctx` is called or the thread +exits. Long-running driver threads that install a sender context once and never +clear it will retain that session for the lifetime of the thread; pair +:func:`~datafusion.ipc.set_sender_ctx` with +:func:`~datafusion.ipc.clear_sender_ctx` (e.g. in a +``try``/``finally``) when the sender context is only needed for a bounded scope. """ from __future__ import annotations @@ -108,7 +110,7 @@ def init_worker(): def set_worker_ctx(ctx: SessionContext) -> None: - """Install this worker's :class:`SessionContext` for shipped expressions. + """Install this worker's `SessionContext` for shipped expressions. Call once per worker — typically from a ``multiprocessing.Pool`` initializer or a Ray actor ``__init__``. Idempotent: overwrites any @@ -127,10 +129,10 @@ def set_worker_ctx(ctx: SessionContext) -> None: def clear_worker_ctx() -> None: - """Remove this worker's installed :class:`SessionContext`. + """Remove this worker's installed `SessionContext`. After clearing, expressions reconstructed in this worker fall back to - the global :class:`SessionContext` — adequate for built-ins and Python + the global `SessionContext` — adequate for built-ins and Python UDFs (scalar, aggregate, window), but anything imported via the FFI capsule protocol must be registered on the global context to resolve. @@ -147,7 +149,7 @@ def clear_worker_ctx() -> None: def get_worker_ctx() -> SessionContext | None: - """Return this worker's installed :class:`SessionContext`, or ``None``. + """Return this worker's installed `SessionContext`, or ``None``. 
Examples: >>> from datafusion.ipc import get_worker_ctx, clear_worker_ctx @@ -159,18 +161,18 @@ def get_worker_ctx() -> SessionContext | None: def set_sender_ctx(ctx: SessionContext) -> None: - """Install this driver's :class:`SessionContext` for outbound pickles. + """Install this driver's `SessionContext` for outbound pickles. - Controls how :func:`pickle.dumps` encodes :class:`Expr` instances on + Controls how `dumps` encodes :class:`~datafusion.expr.Expr` instances on this thread. The most useful application is propagating a session configured with - :meth:`SessionContext.with_python_udf_inlining` so the toggle takes + `with_python_udf_inlining` so the toggle takes effect through pickle (which otherwise calls - :meth:`Expr.to_bytes` with no context and uses the default codec). + `to_bytes` with no context and uses the default codec). Idempotent: overwrites any previous value. Stored in a thread-local slot, so worker threads on the driver may install their own contexts. - Does not affect :meth:`Expr.to_bytes` calls that pass an explicit + Does not affect `to_bytes` calls that pass an explicit ``ctx`` — those continue to use the supplied context. Examples: @@ -185,7 +187,7 @@ def set_sender_ctx(ctx: SessionContext) -> None: def clear_sender_ctx() -> None: - """Remove this driver's installed sender :class:`SessionContext`. + """Remove this driver's installed sender `SessionContext`. After clearing, pickled expressions fall back to the default codec (Python UDF inlining on). @@ -205,7 +207,7 @@ def clear_sender_ctx() -> None: def get_sender_ctx() -> SessionContext | None: - """Return this driver's installed sender :class:`SessionContext`, or ``None``. + """Return this driver's installed sender `SessionContext`, or ``None``. Examples: >>> from datafusion.ipc import get_sender_ctx, clear_sender_ctx @@ -222,7 +224,7 @@ def _resolve_ctx( """Resolve a context for Expr reconstruction. Priority: explicit argument > worker context > global context. 
- Falling back to the global :class:`SessionContext` (instead of a + Falling back to the global `SessionContext` (instead of a freshly constructed one) preserves any registrations the user has installed on it. diff --git a/python/datafusion/plan.py b/python/datafusion/plan.py index b2c6eab3e..faccc1de0 100644 --- a/python/datafusion/plan.py +++ b/python/datafusion/plan.py @@ -103,7 +103,7 @@ def to_bytes(self, ctx: SessionContext | None = None) -> bytes: When ``ctx`` is supplied, encoding routes through the session's installed `LogicalExtensionCodec` so user FFI codecs (registered - via :py:meth:`SessionContext.with_logical_extension_codec`) see + via `with_logical_extension_codec`) see the encode path. With ``ctx=None`` a default codec is used. Tables created in memory from record batches are currently not supported. @@ -113,7 +113,7 @@ def to_bytes(self, ctx: SessionContext | None = None) -> bytes: @staticmethod def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan: - """Deprecated alias for :meth:`from_bytes`.""" + """Deprecated alias for :meth:`~datafusion.plan.LogicalPlan.from_bytes`.""" warnings.warn( "LogicalPlan.from_proto is deprecated; use from_bytes instead", DeprecationWarning, @@ -122,7 +122,7 @@ def from_proto(ctx: SessionContext, data: bytes) -> LogicalPlan: return LogicalPlan.from_bytes(ctx, data) def to_proto(self) -> bytes: - """Deprecated alias for :meth:`to_bytes`.""" + """Deprecated alias for :meth:`~datafusion.plan.LogicalPlan.to_bytes`.""" warnings.warn( "LogicalPlan.to_proto is deprecated; use to_bytes instead", DeprecationWarning, @@ -191,7 +191,7 @@ def to_bytes(self, ctx: SessionContext | None = None) -> bytes: @staticmethod def from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan: - """Deprecated alias for :meth:`from_bytes`.""" + """Deprecated alias for :meth:`~datafusion.plan.ExecutionPlan.from_bytes`.""" warnings.warn( "ExecutionPlan.from_proto is deprecated; use from_bytes instead", DeprecationWarning, @@ -200,7 +200,7 @@ def
from_proto(ctx: SessionContext, data: bytes) -> ExecutionPlan: return ExecutionPlan.from_bytes(ctx, data) def to_proto(self) -> bytes: - """Deprecated alias for :meth:`to_bytes`.""" + """Deprecated alias for :meth:`~datafusion.plan.ExecutionPlan.to_bytes`.""" warnings.warn( "ExecutionPlan.to_proto is deprecated; use to_bytes instead", DeprecationWarning, @@ -227,16 +227,17 @@ def collect_metrics(self) -> list[tuple[str, MetricsSet]]: DataFusion executes a query as a pipeline of operators — for example a data source scan, followed by a filter, followed by a projection. After the DataFrame has been executed (via - :py:meth:`~datafusion.DataFrame.collect`, - :py:meth:`~datafusion.DataFrame.execute_stream`, etc.), each operator + :meth:`~datafusion.dataframe.DataFrame.collect`, + `execute_stream`, etc.), each operator records statistics such as how many rows it produced and how much CPU time it consumed. Each entry in the returned list corresponds to one operator that recorded metrics. The first element of the tuple is the operator's description string — the same text shown by - :py:meth:`display_indent` — which identifies both the operator type - and its key parameters, for example ``"FilterExec: column1@0 > 1"`` + :meth:`~datafusion.plan.ExecutionPlan.display_indent` — which + identifies both the operator type and its key parameters, for example + ``"FilterExec: column1@0 > 1"`` or ``"DataSourceExec: partitions=1"``. Returns: @@ -263,10 +264,11 @@ class MetricsSet: """A set of metrics for a single execution plan operator. A physical plan operator runs independently across one or more partitions. - :py:meth:`metrics` returns the raw per-partition :py:class:`Metric` objects. - The convenience properties (:py:attr:`output_rows`, :py:attr:`elapsed_compute`, - etc.) automatically sum the named metric across *all* partitions, giving a - single aggregate value for the operator as a whole. + :meth:`~datafusion.plan.MetricsSet.metrics` returns the raw per-partition + `Metric` objects.
The convenience properties (`output_rows`, + :attr:`~datafusion.plan.MetricsSet.elapsed_compute`, etc.) + automatically sum the named metric across *all* partitions, giving a single + aggregate value for the operator as a whole. """ def __init__(self, raw: df_internal.MetricsSet) -> None: @@ -343,7 +345,7 @@ def value(self) -> int | datetime.datetime | None: """The value of this metric. Returns an ``int`` for counters, gauges, and time-based metrics - (nanoseconds), a :py:class:`~datetime.datetime` (UTC) for + (nanoseconds), a :class:`~datetime.datetime` (UTC) for ``start_timestamp`` / ``end_timestamp`` metrics, or ``None`` when the value has not been set or is not representable. """ @@ -351,7 +353,7 @@ def value(self) -> int | datetime.datetime | None: @property def value_as_datetime(self) -> datetime.datetime | None: - """The value as a UTC :py:class:`~datetime.datetime` for timestamp metrics. + """The value as a UTC :class:`~datetime.datetime` for timestamp metrics. Returns ``None`` for all non-timestamp metrics and for timestamp metrics whose value has not been set (e.g. before execution). diff --git a/python/datafusion/record_batch.py b/python/datafusion/record_batch.py index c24cde0ac..ebe4a38a2 100644 --- a/python/datafusion/record_batch.py +++ b/python/datafusion/record_batch.py @@ -18,7 +18,7 @@ """This module provides the classes for handling record batches. These are typically the result of dataframe -:py:func:`datafusion.dataframe.execute_stream` operations. +:meth:`~datafusion.dataframe.DataFrame.execute_stream` operations.
""" from __future__ import annotations @@ -32,18 +32,21 @@ import datafusion._internal as df_internal +__all__ = ["RecordBatch", "RecordBatchStream"] + + class RecordBatch: - """This class is essentially a wrapper for :py:class:`pa.RecordBatch`.""" + """This class is essentially a wrapper for :class:`~pyarrow.RecordBatch`.""" def __init__(self, record_batch: df_internal.RecordBatch) -> None: """This constructor is generally not called by the end user. - See the :py:class:`RecordBatchStream` iterator for generating this class. + See the `RecordBatchStream` iterator for generating this class. """ self.record_batch = record_batch def to_pyarrow(self) -> pa.RecordBatch: - """Convert to :py:class:`pa.RecordBatch`.""" + """Convert to :class:`~pyarrow.RecordBatch`.""" return self.record_batch.to_pyarrow() def __arrow_c_array__( @@ -71,7 +74,7 @@ class RecordBatchStream: """This class represents a stream of record batches. These are typically the result of a - :py:func:`~datafusion.dataframe.DataFrame.execute_stream` operation. + :meth:`~datafusion.dataframe.DataFrame.execute_stream` operation. """ def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None: @@ -79,16 +82,21 @@ def __init__(self, record_batch_stream: df_internal.RecordBatchStream) -> None: self.rbs = record_batch_stream def next(self) -> RecordBatch: - """See :py:func:`__next__` for the iterator function.""" + """Return the next batch. + + See + :meth:`~datafusion.record_batch.RecordBatchStream.__next__` for the + iterator function. 
+ """ return next(self) async def __anext__(self) -> RecordBatch: - """Return the next :py:class:`RecordBatch` in the stream asynchronously.""" + """Return the next `RecordBatch` in the stream asynchronously.""" next_batch = await self.rbs.__anext__() return RecordBatch(next_batch) def __next__(self) -> RecordBatch: - """Return the next :py:class:`RecordBatch` in the stream.""" + """Return the next `RecordBatch` in the stream.""" next_batch = next(self.rbs) return RecordBatch(next_batch) diff --git a/python/datafusion/substrait.py b/python/datafusion/substrait.py index 6353ef8cc..c90e4d3c0 100644 --- a/python/datafusion/substrait.py +++ b/python/datafusion/substrait.py @@ -49,8 +49,8 @@ def __init__(self, plan: substrait_internal.Plan) -> None: """Create a substrait plan. The user should not have to call this constructor directly. Rather, it - should be created via :py:class:`Serde` or py:class:`Producer` classes - in this module. + should be created via :class:`~datafusion.substrait.Serde` or + :class:`~datafusion.substrait.Producer` classes in this module. """ self.plan_internal = plan diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py index 81a516af8..42e56ac73 100644 --- a/python/datafusion/user_defined.py +++ b/python/datafusion/user_defined.py @@ -37,6 +37,26 @@ from collections.abc import Callable, Sequence +__all__ = [ + "Accumulator", + "AggregateUDF", + "AggregateUDFExportable", + "LogicalExtensionCodecExportable", + "PhysicalExtensionCodecExportable", + "ScalarUDF", + "ScalarUDFExportable", + "TableFunction", + "Volatility", + "WindowEvaluator", + "WindowUDF", + "WindowUDFExportable", + "udaf", + "udf", + "udtf", + "udwf", +] + + class Volatility(Enum): """Defines how stable or volatile a function is. @@ -128,7 +148,7 @@ def __datafusion_physical_extension_codec__(self) -> object: ... # noqa: D105 class ScalarUDF: """Class for performing scalar user-defined functions (UDF). - Scalar UDFs operate on a row by row basis. 
See also :py:class:`AggregateUDF` for + Scalar UDFs operate on a row by row basis. See also `AggregateUDF` for operating on a group of rows. """ @@ -142,7 +162,7 @@ def __init__( ) -> None: """Instantiate a scalar user-defined function (UDF). - See helper method :py:func:`udf` for argument details. + See helper method :func:`~datafusion.user_defined.udf` for argument details. """ if hasattr(func, "__datafusion_scalar_udf__"): self._udf = df_internal.ScalarUDF.from_pycapsule(func) @@ -157,9 +177,9 @@ def __init__( def _from_internal(cls, internal: df_internal.ScalarUDF) -> ScalarUDF: """Wrap an already-constructed internal ``ScalarUDF`` handle. - Used by :py:meth:`SessionContext.udf` to surface a function looked + Used by :meth:`~datafusion.SessionContext.udf` to surface a function looked up from the session's function registry without re-running - :py:meth:`__init__`. + :meth:`~datafusion.user_defined.ScalarUDF.__init__`. """ wrapper = cls.__new__(cls) wrapper._udf = internal @@ -225,7 +245,7 @@ def udf( def udf(func: ScalarUDFExportable) -> ScalarUDF: ... @staticmethod - def udf(*args: Any, **kwargs: Any): # noqa: D417 + def udf(*args: Any, **kwargs: Any): """Create a new User-Defined Function (UDF). This class can be used both as either a function or a decorator. @@ -240,22 +260,24 @@ def udf(*args: Any, **kwargs: Any): # noqa: D417 When you do so, it will be assumed that the nullability of the inputs and output are True and that they have no metadata. - Args: - func (Callable, optional): Only needed when calling as a function. - Skip this argument when using `udf` as a decorator. If you have a Rust - backed ScalarUDF within a PyCapsule, you can pass this parameter - and ignore the rest. They will be determined directly from the - underlying function. See the online documentation for more information. - input_fields (list[pa.Field | pa.DataType]): The data types or Fields - of the arguments to ``func``. This list must be of the same length - as the number of arguments.
- return_field (_R): The field of the return value from the function. - volatility (Volatility | str): See `Volatility` for allowed values. - name (Optional[str]): A descriptive name for the function. - - Returns: - A user-defined function that can be used in SQL expressions, - data aggregation, or window function calls. + **Parameters:** + + - `func` (`Callable`, optional): Only needed when calling as a function. + Skip this argument when using ``udf`` as a decorator. If you have a Rust + backed ScalarUDF within a PyCapsule, you can pass this parameter + and ignore the rest. They will be determined directly from the + underlying function. See the online documentation for more information. + - `input_fields` (`list[pa.Field | pa.DataType]`): The data types or Fields + of the arguments to ``func``. This list must be of the same length + as the number of arguments. + - `return_field` (`pa.Field | pa.DataType`): The field of the return value + from the function. + - `volatility` (`Volatility | str`): See + :class:`~datafusion.user_defined.Volatility` for allowed values. + - `name` (`str`, optional): A descriptive name for the function. + + **Returns:** a user-defined function that can be used in SQL expressions, + data aggregation, or window function calls. Examples: Using ``udf`` as a function: @@ -355,7 +377,7 @@ def from_pycapsule(func: ScalarUDFExportable) -> ScalarUDF: class Accumulator(metaclass=ABCMeta): - """Defines how an :py:class:`AggregateUDF` accumulates values.""" + """Defines how an `AggregateUDF` accumulates values.""" @abstractmethod def state(self) -> list[pa.Scalar]: @@ -402,7 +424,7 @@ class AggregateUDF: """Class for performing scalar user-defined functions (UDF). Aggregate UDFs operate on a group of rows and return a single value. See - also :py:class:`ScalarUDF` for operating on a row by row basis. + also `ScalarUDF` for operating on a row by row basis. 
""" @overload @@ -438,7 +460,7 @@ def __init__( ) -> None: """Instantiate a user-defined aggregate function (UDAF). - See :py:func:`udaf` for a convenience function and argument + See `udaf` for a convenience function and argument descriptions. """ if hasattr(accumulator, "__datafusion_aggregate_udf__"): @@ -469,9 +491,9 @@ def __init__( def _from_internal(cls, internal: df_internal.AggregateUDF) -> AggregateUDF: """Wrap an already-constructed internal ``AggregateUDF`` handle. - Used by :py:meth:`SessionContext.udaf` to surface a function looked + Used by `udaf` to surface a function looked up from the session's function registry without re-running - :py:meth:`__init__`. + :func:`~__init__`. """ wrapper = cls.__new__(cls) wrapper._udaf = internal @@ -530,7 +552,7 @@ def udaf(accum: AggregateUDFExportable) -> AggregateUDF: ... def udaf(accum: _PyCapsule) -> AggregateUDF: ... @staticmethod - def udaf(*args: Any, **kwargs: Any): # noqa: D417, C901 + def udaf(*args: Any, **kwargs: Any): # noqa: C901 """Create a new User-Defined Aggregate Function (UDAF). This class allows you to define an aggregate function that can be used in @@ -541,10 +563,10 @@ def udaf(*args: Any, **kwargs: Any): # noqa: D417, C901 - As a decorator: ``@udaf(input_types, return_type, state_type, volatility, name)``. When using ``udaf`` as a decorator, do not pass ``accum`` explicitly. - If your :py:class:`Accumulator` can be instantiated with no arguments, you + If your `Accumulator` can be instantiated with no arguments, you can simply pass its type as ``accum``. If you need to pass additional arguments to its constructor, you can define a lambda or a factory method. - During runtime the :py:class:`Accumulator` will be constructed for every + During runtime the `Accumulator` will be constructed for every instance in which this UDAF is used. Examples: @@ -607,22 +629,23 @@ def udaf(*args: Any, **kwargs: Any): # noqa: D417, C901 ... 
"total")[0].as_py() 16.0 - Args: - accum: The accumulator python function. Only needed when calling as a - function. Skip this argument when using ``udaf`` as a decorator. - If you have a Rust backed AggregateUDF within a PyCapsule, you can - pass this parameter and ignore the rest. They will be determined - directly from the underlying function. See the online documentation - for more information. - input_types: The data types of the arguments to ``accum``. - return_type: The data type of the return value. - state_type: The data types of the intermediate accumulation. - volatility: See :py:class:`Volatility` for allowed values. - name: A descriptive name for the function. - - Returns: - A user-defined aggregate function, which can be used in either data - aggregation or window function calls. + **Parameters:** + + - `accum`: The accumulator python function. Only needed when calling as a + function. Skip this argument when using ``udaf`` as a decorator. + If you have a Rust backed AggregateUDF within a PyCapsule, you can + pass this parameter and ignore the rest. They will be determined + directly from the underlying function. See the online documentation + for more information. + - `input_types`: The data types of the arguments to ``accum``. + - `return_type`: The data type of the return value. + - `state_type`: The data types of the intermediate accumulation. + - `volatility`: See + :class:`~datafusion.user_defined.Volatility` for allowed values. + - `name`: A descriptive name for the function. + + **Returns:** a user-defined aggregate function, which can be used in either + data aggregation or window function calls. """ # noqa: E501 W505 def _function( @@ -748,7 +771,7 @@ def get_range(self, idx: int, num_rows: int) -> tuple[int, int]: # noqa: ARG002 etc) Args: - idx:: Current index + idx: Current index num_rows: Number of rows. 
""" return (idx, idx + 1) @@ -762,13 +785,13 @@ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array: This function is called once per input *partition* for window functions that *do not use* values from the window frame, such as - :py:func:`~datafusion.functions.row_number`, - :py:func:`~datafusion.functions.rank`, - :py:func:`~datafusion.functions.dense_rank`, - :py:func:`~datafusion.functions.percent_rank`, - :py:func:`~datafusion.functions.cume_dist`, - :py:func:`~datafusion.functions.lead`, - and :py:func:`~datafusion.functions.lag`. + :func:`~datafusion.functions.row_number`, + :func:`~datafusion.functions.rank`, + :func:`~datafusion.functions.dense_rank`, + :func:`~datafusion.functions.percent_rank`, + :func:`~datafusion.functions.cume_dist`, + :func:`~datafusion.functions.lead`, + and :func:`~datafusion.functions.lag`. It produces the result of all rows in a single pass. It expects to receive the entire partition as the ``value`` and @@ -874,7 +897,7 @@ class WindowUDF: """Class for performing window user-defined functions (UDF). Window UDFs operate on a partition of rows. See - also :py:class:`ScalarUDF` for operating on a row by row basis. + also `ScalarUDF` for operating on a row by row basis. """ def __init__( @@ -887,7 +910,7 @@ def __init__( ) -> None: """Instantiate a user-defined window function (UDWF). - See :py:func:`udwf` for a convenience function and argument + See `udwf` for a convenience function and argument descriptions. """ if hasattr(func, "__datafusion_window_udf__"): @@ -901,9 +924,9 @@ def __init__( def _from_internal(cls, internal: df_internal.WindowUDF) -> WindowUDF: """Wrap an already-constructed internal ``WindowUDF`` handle. - Used by :py:meth:`SessionContext.udwf` to surface a function looked + Used by `udwf` to surface a function looked up from the session's function registry without re-running - :py:meth:`__init__`. + :func:`~__init__`. 
""" wrapper = cls.__new__(cls) wrapper._udwf = internal @@ -952,7 +975,7 @@ def udwf( ) -> WindowUDF: ... @staticmethod - def udwf(*args: Any, **kwargs: Any): # noqa: D417 + def udwf(*args: Any, **kwargs: Any): """Create a new User-Defined Window Function (UDWF). This class can be used both as either a function or a decorator. @@ -1001,19 +1024,21 @@ def udwf(*args: Any, **kwargs: Any): # noqa: D417 >>> df.select(biased_numbers(col("a")).alias("result")).to_pydict() {'result': [10, 11, 12]} - Args: - func: Only needed when calling as a function. Skip this argument when - using ``udwf`` as a decorator. If you have a Rust backed WindowUDF - within a PyCapsule, you can pass this parameter and ignore the rest. - They will be determined directly from the underlying function. See - the online documentation for more information. - input_types: The data types of the arguments. - return_type: The data type of the return value. - volatility: See :py:class:`Volatility` for allowed values. - name: A descriptive name for the function. - - Returns: - A user-defined window function that can be used in window function calls. + **Parameters:** + + - `func`: Only needed when calling as a function. Skip this argument when + using ``udwf`` as a decorator. If you have a Rust backed WindowUDF + within a PyCapsule, you can pass this parameter and ignore the rest. + They will be determined directly from the underlying function. See + the online documentation for more information. + - `input_types`: The data types of the arguments. + - `return_type`: The data type of the return value. + - `volatility`: See + :class:`~datafusion.user_defined.Volatility` for allowed values. + - `name`: A descriptive name for the function. + + **Returns:** a user-defined window function that can be used in window + function calls. 
""" if hasattr(args[0], "__datafusion_window_udf__"): return WindowUDF.from_pycapsule(args[0]) @@ -1107,7 +1132,7 @@ def _wrap_session_kwarg_for_udtf(func: Callable[..., Any]) -> Callable[..., Any] The Rust call site forwards a ``datafusion._internal.SessionContext``, but UDTF authors expect to interact with the public - :class:`datafusion.SessionContext` wrapper. This closure wraps the + :class:`~datafusion.SessionContext` wrapper. This closure wraps the internal object once per call before delegating to ``func``. """ @@ -1138,7 +1163,7 @@ def __init__( """Instantiate a user-defined table function (UDTF). Set ``with_session=True`` to have the calling - :class:`SessionContext` passed as a ``session`` keyword argument + `SessionContext` passed as a ``session`` keyword argument on each invocation. Use it inside the callback to look up registered tables, UDFs, or session configuration. When ``with_session`` is ``False`` (the default), ``func`` is invoked @@ -1147,15 +1172,15 @@ def __init__( ``with_session=True`` is only supported for pure-Python callables. Passing it together with an FFI-exported table function (one exposing ``__datafusion_table_function__``) raises - :class:`TypeError`. + :exc:`~TypeError`. Registry mutations performed through the injected session (such as registering tables or UDFs) propagate to the caller's - :class:`SessionContext` because the registries are shared. + `SessionContext` because the registries are shared. Configuration changes do **not** propagate; the wrapper holds its own clone of the session config. - See :py:func:`udtf` for a convenience function and argument + See `udtf` for a convenience function and argument descriptions. """ if with_session and hasattr(func, "__datafusion_table_function__"): @@ -1194,7 +1219,7 @@ def udtf(*args: Any, with_session: bool = False, **kwargs: Any): """Create a new User-Defined Table Function (UDTF). 
Pass ``with_session=True`` to have the calling - :class:`SessionContext` injected as a ``session`` keyword + `SessionContext` injected as a ``session`` keyword argument on each invocation. """ if args and callable(args[0]): diff --git a/python/tests/test_docs_coverage.py b/python/tests/test_docs_coverage.py new file mode 100644 index 000000000..2c6929018 --- /dev/null +++ b/python/tests/test_docs_coverage.py @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Drift guards between the public API surface and the mkdocs reference site. + +Two checks: + +1. Every symbol in ``datafusion.__all__`` is covered by an ``mkdocstrings`` + ``:::`` directive somewhere under ``docs/source/reference/``. Coverage may + be direct (``::: datafusion.dataframe.DataFrame``) or by whole-module + autodoc (``::: datafusion.functions``). +2. Every ``:::`` directive in the reference pages resolves to a real, + importable Python object. Renames or removals that orphan a stub fail the + suite instead of silently producing an empty doc page. 
+""" + +from __future__ import annotations + +import importlib +import inspect +import re +from pathlib import Path + +import datafusion + +REF_DIR = Path(__file__).resolve().parents[2] / "docs" / "source" / "reference" +_DIRECTIVE_RE = re.compile(r"^:::\s+(\S+)\s*$", re.MULTILINE) + + +def _all_directives() -> set[str]: + paths: set[str] = set() + for md in REF_DIR.rglob("*.md"): + paths.update(_DIRECTIVE_RE.findall(md.read_text())) + return paths + + +def _is_covered(qual: str, directives: set[str]) -> bool: + if qual in directives: + return True + parent = qual.rsplit(".", 1)[0] if "." in qual else None + return parent in directives if parent else False + + +def test_public_api_documented() -> None: + directives = _all_directives() + assert directives, f"no ::: directives found under {REF_DIR}" + + missing: list[str] = [] + for name in datafusion.__all__: + obj = getattr(datafusion, name) + if inspect.ismodule(obj): + mod_name = obj.__name__ + if mod_name in directives or any( + d.startswith(mod_name + ".") for d in directives + ): + continue + missing.append(f"{name} (module {mod_name})") + continue + + module = getattr(obj, "__module__", None) or "datafusion" + qual = f"{module}.{name}" + if _is_covered(qual, directives) or module in directives: + continue + missing.append(f"{name} (expected '::: {qual}' or '::: {module}')") + + assert not missing, ( + "Public API symbols missing from docs/source/reference/*.md:\n " + + "\n ".join(missing) + ) + + +def test_directive_targets_resolve() -> None: + bad: list[str] = [] + for path in sorted(_all_directives()): + parts = path.split(".") + obj = None + remainder: list[str] = [] + for i in range(len(parts), 0, -1): + try: + obj = importlib.import_module(".".join(parts[:i])) + remainder = parts[i:] + break + except ImportError: + continue + if obj is None: + bad.append(f"{path} (no importable prefix)") + continue + try: + for attr in remainder: + obj = getattr(obj, attr) + except AttributeError: + bad.append(f"{path} 
(attribute chain broken)") + + assert not bad, ( + "Doc ::: directives reference symbols that no longer exist:\n " + + "\n ".join(bad) + ) diff --git a/uv.lock b/uv.lock index 89617aed0..77b94882d 100644 --- a/uv.lock +++ b/uv.lock @@ -8,27 +8,6 @@ resolution-markers = [ "python_full_version < '3.11'", ] -[[package]] -name = "accessible-pygments" -version = "0.0.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/bc/c1/bbac6a50d02774f91572938964c582fff4270eee73ab822a4aeea4d8b11b/accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872", size = 1377899, upload-time = "2024-05-10T11:23:10.216Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8d/3f/95338030883d8c8b91223b4e21744b04d11b161a3ef117295d8241f50ab4/accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7", size = 1395903, upload-time = "2024-05-10T11:23:08.421Z" }, -] - -[[package]] -name = "alabaster" -version = "1.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a6/f8/d9c74d0daf3f742840fd818d69cfae176fa332022fd44e3469487d5a9420/alabaster-1.0.0.tar.gz", hash = "sha256:c00dca57bca26fa62a6d7d0a9fcce65f3e026e9bfe33e9c538fd3fbb2144fd9e", size = 24210, upload-time = "2024-07-26T18:15:03.762Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/b3/6b4067be973ae96ba0d615946e314c5ae35f9f993eca561b356540bb0c2b/alabaster-1.0.0-py3-none-any.whl", hash = "sha256:fc6786402dc3fcb2de3cabd5fe455a2db534b371124f1f21de8731783dec828b", size = 13929, upload-time = "2024-07-26T18:15:02.05Z" }, -] - [[package]] name = "arro3-core" version = "0.6.5" @@ -91,27 +70,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/0b/fd/4f8dac58ea17e05978bf35cb9a3e485b1ff3cdd6e2cc29deb08f54080de4/arro3_core-0.6.5-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9a58acbc61480b533aa84d735db04b1e68fc7f6807ab694d606c03b5e694d83d", size = 2954405, upload-time = "2025-10-13T23:12:35.328Z" }, ] -[[package]] -name = "astroid" -version = "3.3.8" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/80/c5/5c83c48bbf547f3dd8b587529db7cf5a265a3368b33e85e76af8ff6061d3/astroid-3.3.8.tar.gz", hash = "sha256:a88c7994f914a4ea8572fac479459f4955eeccc877be3f2d959a33273b0cf40b", size = 398196, upload-time = "2024-12-24T01:13:05.59Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/07/28/0bc8a17d6cd4cc3c79ae41b7105a2b9a327c110e5ddd37a8a27b29a5c8a2/astroid-3.3.8-py3-none-any.whl", hash = "sha256:187ccc0c248bfbba564826c26f070494f7bc964fd286b6d9fff4420e55de828c", size = 275153, upload-time = "2024-12-24T01:13:02.726Z" }, -] - -[[package]] -name = "asttokens" -version = "3.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978, upload-time = "2024-11-30T04:30:14.439Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918, upload-time = "2024-11-30T04:30:10.946Z" }, -] - [[package]] name = "babel" version = "2.16.0" @@ -122,15 +80,16 @@ wheels = [ ] [[package]] -name = "beautifulsoup4" -version = "4.12.3" +name = "backrefs" +version = "7.0" source = { 
registry = "https://pypi.org/simple" } -dependencies = [ - { name = "soupsieve" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/b3/ca/824b1195773ce6166d388573fc106ce56d4a805bd7427b624e063596ec58/beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051", size = 581181, upload-time = "2024-01-17T16:53:17.902Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/a7dd63622beef68cc0d3c3c36d472e143dd95443d5ebf14cd1a5b4dfbf11/backrefs-7.0.tar.gz", hash = "sha256:4989bb9e1e99eb23647c7160ed51fb21d0b41b5d200f2d3017da41e023097e82", size = 7012453, upload-time = "2026-04-28T16:28:04.215Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b1/fe/e8c672695b37eecc5cbf43e1d0638d88d66ba3a44c4d321c796f4e59167f/beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed", size = 147925, upload-time = "2024-01-17T16:53:12.779Z" }, + { url = "https://files.pythonhosted.org/packages/d4/39/39a31d7eae729ea14ed10c3ccef79371197177b9355a86cb3525709e8502/backrefs-7.0-py310-none-any.whl", hash = "sha256:b57cd227ea556b0aed3dc9b8da4628db4eabc0402c6d7fcfc69283a93955f7e9", size = 380824, upload-time = "2026-04-28T16:27:55.647Z" }, + { url = "https://files.pythonhosted.org/packages/c9/b5/9302644225ba7dfa934a2ff2b9c7bb85701313a90dddb3dfaf693fa5bae2/backrefs-7.0-py311-none-any.whl", hash = "sha256:a0fa7360c63509e9e077e174ef4e6d3c21c8db94189b9d957289ae6d794b9475", size = 392626, upload-time = "2026-04-28T16:27:57.42Z" }, + { url = "https://files.pythonhosted.org/packages/36/da/87912ddec6e06feffbaa3d7aa18fc6352bee2e8f1fee185d7d1690f8f4e8/backrefs-7.0-py312-none-any.whl", hash = "sha256:ca42ce6a49ace3d75684dfa9937f3373902a63284ecb385ce36d15e5dcb41c12", size = 398537, upload-time = "2026-04-28T16:27:58.913Z" }, + { url = 
"https://files.pythonhosted.org/packages/00/bb/90ba423612b6aa0adccc6b1874bcd4a9b44b660c0c16f346611e00f64ac3/backrefs-7.0-py313-none-any.whl", hash = "sha256:f2c52955d631b9e1ac4cd56209f0a3a946d592b98e7790e77699339ae01c102a", size = 400491, upload-time = "2026-04-28T16:28:00.928Z" }, + { url = "https://files.pythonhosted.org/packages/3e/5c/fb93d3092640a24dfb7bd7727a24016d7c01774ca013e60efd3f683c8002/backrefs-7.0-py314-none-any.whl", hash = "sha256:a6448b28180e3ca01134c9cf09dcebafad8531072e09903c5451748a05f24bc9", size = 412349, upload-time = "2026-04-28T16:28:02.412Z" }, ] [[package]] @@ -269,6 +228,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0e/f6/65ecc6878a89bb1c23a086ea335ad4bf21a588990c3f535a227b9eea9108/charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85", size = 49767, upload-time = "2024-12-24T18:12:32.852Z" }, ] +[[package]] +name = "click" +version = "8.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9b/98/518d8e5081007684232226f475082b30087d0f585e8457db087298259f49/click-8.4.1.tar.gz", hash = "sha256:918b5633eddf6b41c32d4f454bf0de810065c74e3f7dbf8ee5452f8be88d3e96", size = 353007, upload-time = "2026-05-22T04:08:37.769Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/0d/67e5b4109ea4a837e80daa87c2c696711955e40449a97e8926672534def2/click-8.4.1-py3-none-any.whl", hash = "sha256:482be17c6991b8c19c5429a1e995d9b0efdbb63172824c41f99965dc0ade8ec2", size = 116639, upload-time = "2026-05-22T04:08:35.26Z" }, +] + [[package]] name = "cloudpickle" version = "3.1.2" @@ -360,17 +331,12 @@ dev = [ { name = "toml" }, ] docs = [ - { name = "ipython" }, - { name = "jinja2" }, - { name = "myst-parser" }, + { name = "markdown-exec", extra = ["ansi"] }, + { name = "mkdocs" }, + { name = "mkdocs-material" }, + { name = 
"mkdocs-redirects" }, + { name = "mkdocstrings", extra = ["python"] }, { name = "pandas" }, - { name = "pickleshare" }, - { name = "pydata-sphinx-theme" }, - { name = "setuptools" }, - { name = "sphinx" }, - { name = "sphinx-autoapi" }, - { name = "sphinx-reredirects", version = "0.1.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "sphinx-reredirects", version = "1.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] release = [ { name = "pygithub" }, @@ -402,28 +368,15 @@ dev = [ { name = "toml", specifier = ">=0.10.2" }, ] docs = [ - { name = "ipython", specifier = ">=8.12.3" }, - { name = "jinja2", specifier = ">=3.1.5" }, - { name = "myst-parser", specifier = ">=3.0.1" }, + { name = "markdown-exec", extras = ["ansi"], specifier = ">=1.10" }, + { name = "mkdocs", specifier = ">=1.6,<2" }, + { name = "mkdocs-material", specifier = ">=9.5,<10" }, + { name = "mkdocs-redirects", specifier = ">=1.2" }, + { name = "mkdocstrings", extras = ["python"], specifier = ">=0.27" }, { name = "pandas", specifier = ">=2.0.3" }, - { name = "pickleshare", specifier = ">=0.7.5" }, - { name = "pydata-sphinx-theme", specifier = ">=0.16,<0.17" }, - { name = "setuptools", specifier = ">=75.3.0" }, - { name = "sphinx", specifier = ">=7.1.2" }, - { name = "sphinx-autoapi", specifier = ">=3.4.0" }, - { name = "sphinx-reredirects", specifier = ">=0.1.5" }, ] release = [{ name = "pygithub", specifier = "==2.5.0" }] -[[package]] -name = "decorator" -version = "5.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016, upload-time = "2022-01-07T08:20:05.666Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073, upload-time = "2022-01-07T08:20:03.734Z" }, -] - [[package]] name = "deprecated" version = "1.2.18" @@ -446,39 +399,42 @@ wheels = [ ] [[package]] -name = "docutils" -version = "0.21.2" +name = "exceptiongroup" +version = "1.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444, upload-time = "2024-04-23T18:57:18.24Z" } +sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883, upload-time = "2024-07-12T22:26:00.161Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" }, + { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453, upload-time = "2024-07-12T22:25:58.476Z" }, ] [[package]] -name = "exceptiongroup" -version = "1.2.2" +name = "filelock" +version = "3.18.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = 
"https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883, upload-time = "2024-07-12T22:26:00.161Z" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075, upload-time = "2025-03-14T07:11:40.47Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453, upload-time = "2024-07-12T22:25:58.476Z" }, + { url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215, upload-time = "2025-03-14T07:11:39.145Z" }, ] [[package]] -name = "executing" +name = "ghp-import" version = "2.1.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8c/e3/7d45f492c2c4a0e8e0fad57d081a7c8a0286cdd86372b070cca1ec0caa1e/executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab", size = 977485, upload-time = "2024-09-01T12:37:35.708Z" } +dependencies = [ + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d9/29/d40217cbe2f6b1359e00c6c307bb3fc876ba74068cbab3dde77f03ca0dc4/ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343", size = 10943, upload-time = "2022-05-02T15:47:16.11Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/b5/fd/afcd0496feca3276f509df3dbd5dae726fcc756f1a08d9e25abe1733f962/executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf", size = 25805, upload-time = "2024-09-01T12:37:33.007Z" }, + { url = "https://files.pythonhosted.org/packages/f7/ec/67fbef5d497f86283db54c22eec6f6140243aae73265799baaaa19cd17fb/ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619", size = 11034, upload-time = "2022-05-02T15:47:14.552Z" }, ] [[package]] -name = "filelock" -version = "3.18.0" +name = "griffelib" +version = "2.0.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075, upload-time = "2025-03-14T07:11:40.47Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/82/74f4a3310cdabfbb10da554c3a672847f1ed33c6f61dd472681ce7f1fe67/griffelib-2.0.2.tar.gz", hash = "sha256:3cf20b3bc470e83763ffbf236e0076b1211bac1bc67de13daf494640f2de707e", size = 166461, upload-time = "2026-03-27T11:34:51.091Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215, upload-time = "2025-03-14T07:11:39.145Z" }, + { url = "https://files.pythonhosted.org/packages/11/8c/c9138d881c79aa0ea9ed83cbd58d5ca75624378b38cee225dcf5c42cc91f/griffelib-2.0.2-py3-none-any.whl", hash = "sha256:925c857658fb1ba40c0772c37acbc2ab650bd794d9c1b9726922e36ea4117ea1", size = 142357, upload-time = "2026-03-27T11:34:46.275Z" }, ] [[package]] @@ -499,15 +455,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload-time = "2024-09-15T18:07:37.964Z" }, ] -[[package]] -name = "imagesize" -version = "1.4.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a7/84/62473fb57d61e31fef6e36d64a179c8781605429fd927b5dd608c997be31/imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a", size = 1280026, upload-time = "2022-07-01T12:21:05.687Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/62/85c4c919272577931d407be5ba5d71c20f0b616d31a0befe0ae45bb79abd/imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b", size = 8769, upload-time = "2022-07-01T12:21:02.467Z" }, -] - [[package]] name = "iniconfig" version = "2.0.0" @@ -518,61 +465,41 @@ wheels = [ ] [[package]] -name = "ipython" -version = "8.31.0" +name = "jinja2" +version = "3.1.5" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "decorator" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "jedi" }, - { name = "matplotlib-inline" }, - { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, - { name = "prompt-toolkit" }, - { name = "pygments" }, - { name = "stack-data" }, - { name = "traitlets" }, - { name = "typing-extensions", marker = "python_full_version < '3.12'" }, + { name = "markupsafe" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/01/35/6f90fdddff7a08b7b715fccbd2427b5212c9525cd043d26fdc45bee0708d/ipython-8.31.0.tar.gz", hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b", size = 5501011, 
upload-time = "2024-12-20T12:34:22.61Z" } +sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674, upload-time = "2024-12-21T18:30:22.828Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/04/60/d0feb6b6d9fe4ab89fe8fe5b47cbf6cd936bfd9f1e7ffa9d0015425aeed6/ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6", size = 821583, upload-time = "2024-12-20T12:34:17.106Z" }, + { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596, upload-time = "2024-12-21T18:30:19.133Z" }, ] [[package]] -name = "jedi" -version = "0.19.2" +name = "markdown" +version = "3.10.2" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "parso" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2b/f4/69fa6ed85ae003c2378ffa8f6d2e3234662abd02c10d216c0ba96081a238/markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950", size = 368805, upload-time = "2026-02-09T14:57:26.942Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, + { url = 
"https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" }, ] [[package]] -name = "jinja2" -version = "3.1.5" +name = "markdown-exec" +version = "1.12.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe" }, + { name = "pymdown-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/af/92/b3130cbbf5591acf9ade8708c365f3238046ac7cb8ccba6e81abccb0ccff/jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb", size = 244674, upload-time = "2024-12-21T18:30:22.828Z" } +sdist = { url = "https://files.pythonhosted.org/packages/96/73/1f20927d075c83c0e2bc814d3b8f9bd254d919069f78c5423224b4407944/markdown_exec-1.12.1.tar.gz", hash = "sha256:eee8ba0df99a5400092eeda80212ba3968f3cbbf3a33f86f1cd25161538e6534", size = 78105, upload-time = "2025-11-11T19:25:05.44Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/bd/0f/2ba5fbcd631e3e88689309dbe978c5769e883e4b84ebfe7da30b43275c5a/jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb", size = 134596, upload-time = "2024-12-21T18:30:19.133Z" }, + { url = "https://files.pythonhosted.org/packages/ea/22/7b684ddb01b423b79eaba9726954bbe559540d510abc7a72a84d8eee1b26/markdown_exec-1.12.1-py3-none-any.whl", hash = "sha256:a645dce411fee297f5b4a4169c245ec51e20061d5b71e225bef006e87f3e465f", size = 38046, upload-time = "2025-11-11T19:25:03.878Z" }, ] -[[package]] -name = "markdown-it-py" -version = "3.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mdurl" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", 
hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596, upload-time = "2023-06-03T06:41:14.443Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" }, +[package.optional-dependencies] +ansi = [ + { name = "pygments-ansi-color" }, ] [[package]] @@ -633,18 +560,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" }, ] -[[package]] -name = "matplotlib-inline" -version = "0.1.7" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "traitlets" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159, upload-time = "2024-04-15T13:44:44.803Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" }, -] - [[package]] name = "maturin" version = "1.13.3" @@ -670,41 +585,145 @@ wheels = [ ] [[package]] -name = "mdit-py-plugins" -version = "0.4.2" +name = "mergedeep" +version = "1.3.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/3a/41/580bb4006e3ed0361b8151a01d324fb03f420815446c7def45d02f74c270/mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8", size = 4661, upload-time = "2021-02-05T18:55:30.623Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/19/04f9b178c2d8a15b076c8b5140708fa6ffc5601fb6f1e975537072df5b2a/mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307", size = 6354, upload-time = "2021-02-05T18:55:29.583Z" }, +] + +[[package]] +name = "mkdocs" +version = "1.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markdown-it-py" }, + { name = "click" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "ghp-import" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mergedeep" }, + { name = "mkdocs-get-deps" }, + { name = "packaging" }, + { name = "pathspec" }, + { name = "pyyaml" }, + { name = "pyyaml-env-tag" }, + { name = "watchdog" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/19/03/a2ecab526543b152300717cf232bb4bb8605b6edb946c845016fa9c9c9fd/mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5", size = 43542, upload-time = "2024-09-09T20:27:49.564Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bc/c6/bbd4f061bd16b378247f12953ffcb04786a618ce5e904b8c5a01a0309061/mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2", size = 3889159, upload-time = "2024-08-30T12:24:06.899Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/f7/7782a043553ee469c1ff49cfa1cdace2d6bf99a1f333cf38676b3ddf30da/mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636", size = 55316, upload-time = "2024-09-09T20:27:48.397Z" }, + { 
url = "https://files.pythonhosted.org/packages/22/5b/dbc6a8cddc9cfa9c4971d59fb12bb8d42e161b7e7f8cc89e49137c5b279c/mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e", size = 3864451, upload-time = "2024-08-30T12:24:05.054Z" }, ] [[package]] -name = "mdurl" -version = "0.1.2" +name = "mkdocs-autorefs" +version = "1.4.4" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +dependencies = [ + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mkdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/c0/f641843de3f612a6b48253f39244165acff36657a91cc903633d456ae1ac/mkdocs_autorefs-1.4.4.tar.gz", hash = "sha256:d54a284f27a7346b9c38f1f852177940c222da508e66edc816a0fa55fc6da197", size = 56588, upload-time = "2026-02-10T15:23:55.105Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, + { url = "https://files.pythonhosted.org/packages/28/de/a3e710469772c6a89595fc52816da05c1e164b4c866a89e3cb82fb1b67c5/mkdocs_autorefs-1.4.4-py3-none-any.whl", hash = "sha256:834ef5408d827071ad1bc69e0f39704fa34c7fc05bc8e1c72b227dfdc5c76089", size = 25530, upload-time = "2026-02-10T15:23:53.817Z" }, ] [[package]] -name = "myst-parser" -version = "4.0.0" +name = "mkdocs-get-deps" +version = "0.2.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "docutils" }, - { name = "jinja2" }, - { name = "markdown-it-py" }, - { name = "mdit-py-plugins" }, + { name = 
"mergedeep" }, + { name = "platformdirs" }, { name = "pyyaml" }, - { name = "sphinx" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/85/55/6d1741a1780e5e65038b74bce6689da15f620261c490c3511eb4c12bac4b/myst_parser-4.0.0.tar.gz", hash = "sha256:851c9dfb44e36e56d15d05e72f02b80da21a9e0d07cba96baf5e2d476bb91531", size = 93858, upload-time = "2024-08-05T14:02:45.798Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ce/25/b3cccb187655b9393572bde9b09261d267c3bf2f2cdabe347673be5976a6/mkdocs_get_deps-0.2.2.tar.gz", hash = "sha256:8ee8d5f316cdbbb2834bc1df6e69c08fe769a83e040060de26d3c19fad3599a1", size = 11047, upload-time = "2026-03-10T02:46:33.632Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/29/744136411e785c4b0b744d5413e56555265939ab3a104c6a4b719dad33fd/mkdocs_get_deps-0.2.2-py3-none-any.whl", hash = "sha256:e7878cbeac04860b8b5e0ca31d3abad3df9411a75a32cde82f8e44b6c16ff650", size = 9555, upload-time = "2026-03-10T02:46:32.256Z" }, +] + +[[package]] +name = "mkdocs-material" +version = "9.7.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "babel" }, + { name = "backrefs" }, + { name = "colorama" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "mkdocs" }, + { name = "mkdocs-material-extensions" }, + { name = "paginate" }, + { name = "pygments" }, + { name = "pymdown-extensions" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/29/6d2bcf41ae40802c4beda2432396fff97b8456fb496371d1bc7aad6512ec/mkdocs_material-9.7.6.tar.gz", hash = "sha256:00bdde50574f776d328b1862fe65daeaf581ec309bd150f7bff345a098c64a69", size = 4097959, upload-time = "2026-03-19T15:41:58.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/01/bc663630c510822c95c47a66af9fa7a443c295b47d5f041e5e6ae62ef659/mkdocs_material-9.7.6-py3-none-any.whl", hash = "sha256:71b84353921b8ea1ba84fe11c50912cc512da8fe0881038fcc9a0761c0e635ba", size = 9305470, upload-time = 
"2026-03-19T15:41:55.217Z" }, +] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/9b/9b4c96d6593b2a541e1cb8b34899a6d021d208bb357042823d4d2cabdbe7/mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443", size = 11847, upload-time = "2023-11-22T19:09:45.208Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/54/662a4743aa81d9582ee9339d4ffa3c8fd40a4965e033d77b9da9774d3960/mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31", size = 8728, upload-time = "2023-11-22T19:09:43.465Z" }, +] + +[[package]] +name = "mkdocs-redirects" +version = "1.2.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mkdocs" }, + { name = "properdocs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/73/25/49725f78ca5d3026b09973f7a2b3a8b179cc2e8c15e43d5a13bc79f6b274/mkdocs_redirects-1.2.3.tar.gz", hash = "sha256:5e980330999299729a2d6a125347d1af78023d68a23681a4de3053ce7dfe2e51", size = 7712, upload-time = "2026-03-28T13:57:41.766Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ca/b4/b036f8fdb667587bb37df29dc6644681dd78b7a2a6321a34684b79412b28/myst_parser-4.0.0-py3-none-any.whl", hash = "sha256:b9317997552424448c6096c2558872fdb6f81d3ecb3a40ce84a7518798f3f28d", size = 84563, upload-time = "2024-08-05T14:02:43.767Z" }, + { url = "https://files.pythonhosted.org/packages/c6/90/871b1cddc01d2ba1637b858eeeabc2e3013dc8df591306b5567b98ef0870/mkdocs_redirects-1.2.3-py3-none-any.whl", hash = "sha256:ec7312fff462d03ec16395d0c001006a418f8d0c21cdf2b47ff11cf839dc3ce0", size = 6245, upload-time = "2026-03-28T13:57:40.466Z" }, +] + +[[package]] +name = "mkdocstrings" +version = "1.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + 
{ name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "mkdocs" }, + { name = "mkdocs-autorefs" }, + { name = "pymdown-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/5d/f888d4d3eb31359b327bc9b17a212d6ef03fe0b0682fbb3fc2cb849fb12b/mkdocstrings-1.0.4.tar.gz", hash = "sha256:3969a6515b77db65fd097b53c1b7aa4ae840bd71a2ee62a6a3e89503446d7172", size = 100088, upload-time = "2026-04-15T09:16:53.376Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6e/94/be70f8ee9c45f2f62b39a1f0e9303bc20e138a8f3b8e50ffd89498e177e1/mkdocstrings-1.0.4-py3-none-any.whl", hash = "sha256:63464b4b29053514f32a1dbbf604e52876d5e638111b0c295ab7ed3cac73ca9b", size = 35560, upload-time = "2026-04-15T09:16:51.436Z" }, +] + +[package.optional-dependencies] +python = [ + { name = "mkdocstrings-python" }, +] + +[[package]] +name = "mkdocstrings-python" +version = "2.0.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "griffelib" }, + { name = "mkdocs-autorefs" }, + { name = "mkdocstrings" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a4/b4/5fed370d8ebd96e4e399460a7146ae989263f16588b05a6facd6dbd51e60/mkdocstrings_python-2.0.4.tar.gz", hash = "sha256:58c73c5d358e64e9b1673447663f4a2f8a8941e392e225fc0a0c893758cc452f", size = 199219, upload-time = "2026-06-05T08:13:01.819Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/e3/00ec594aef5f55522e6d373bc2ac53e53a8f5e9ae32f2d6854b0de4270f3/mkdocstrings_python-2.0.4-py3-none-any.whl", hash = "sha256:fd87c173e1e719a85997b6d4f852cdc55f36710e0ed08da3a7bd9abe79c9db00", size = 104790, upload-time = "2026-06-05T08:13:00.393Z" }, ] [[package]] @@ -946,6 +965,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = 
"sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451, upload-time = "2024-11-08T09:47:44.722Z" }, ] +[[package]] +name = "paginate" +version = "0.5.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/46/68dde5b6bc00c1296ec6466ab27dddede6aec9af1b99090e1107091b3b84/paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945", size = 19252, upload-time = "2024-08-25T14:17:24.139Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/96/04b8e52da071d28f5e21a805b19cb9390aa17a47462ac87f5e2696b9566d/paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591", size = 13746, upload-time = "2024-08-25T14:17:22.55Z" }, +] + [[package]] name = "pandas" version = "2.2.3" @@ -996,33 +1024,12 @@ wheels = [ ] [[package]] -name = "parso" -version = "0.8.4" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609, upload-time = "2024-04-05T09:43:55.897Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, -] - -[[package]] -name = "pexpect" -version = "4.9.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ptyprocess" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 
166450, upload-time = "2023-11-25T09:07:26.339Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, -] - -[[package]] -name = "pickleshare" -version = "0.7.5" +name = "pathspec" +version = "1.1.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/b6/df3c1c9b616e9c0edbc4fbab6ddd09df9535849c64ba51fcb6531c32d4d8/pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", size = 6161, upload-time = "2018-09-25T19:17:37.249Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/82/42f767fc1c1143d6fd36efb827202a2d997a375e160a71eb2888a925aac1/pathspec-1.1.1.tar.gz", hash = "sha256:17db5ecd524104a120e173814c90367a96a98d07c45b2e10c2f3919fff91bf5a", size = 135180, upload-time = "2026-04-27T01:46:08.907Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/41/220f49aaea88bc6fa6cba8d05ecf24676326156c23b991e80b3f2fc24c77/pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56", size = 6877, upload-time = "2018-09-25T19:17:35.817Z" }, + { url = "https://files.pythonhosted.org/packages/f1/d9/7fb5aa316bc299258e68c73ba3bddbc499654a07f151cba08f6153988714/pathspec-1.1.1-py3-none-any.whl", hash = "sha256:a00ce642f577bf7f473932318056212bc4f8bfdf53128c78bbd5af0b9b20b189", size = 57328, upload-time = "2026-04-27T01:46:07.06Z" }, ] [[package]] @@ -1060,33 +1067,26 @@ wheels = [ ] [[package]] -name = "prompt-toolkit" -version = "3.0.48" +name = "properdocs" +version = "1.6.7" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "wcwidth" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/2d/4f/feb5e137aff82f7c7f3248267b97451da3644f6cdc218edfe549fb354127/prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90", size = 424684, upload-time = "2024-09-25T10:20:57.609Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a9/6a/fd08d94654f7e67c52ca30523a178b3f8ccc4237fce4be90d39c938a831a/prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e", size = 386595, upload-time = "2024-09-25T10:20:53.932Z" }, -] - -[[package]] -name = "ptyprocess" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, + { name = "click" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "ghp-import" }, + { name = "jinja2" }, + { name = "markdown" }, + { name = "markupsafe" }, + { name = "packaging" }, + { name = "pathspec" }, + { name = "platformdirs" }, + { name = "pyyaml" }, + { name = "pyyaml-env-tag" }, + { name = "watchdog" }, ] - -[[package]] -name = "pure-eval" -version = "0.2.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, 
upload-time = "2024-07-21T12:58:21.801Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/29/f27a4e1eddf72ed3db6e47818fbafe6debbf09fd7051f9c1a007239b46ef/properdocs-1.6.7.tar.gz", hash = "sha256:adc7b16e562890af0e098a7e5b02e3a81c20894a87d6a28d345c9300de73c26e", size = 276141, upload-time = "2026-03-20T20:07:48.167Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, + { url = "https://files.pythonhosted.org/packages/bd/4d/fc923f5c85318ee8cc903566dc4e0ebe41b2dfc1d2ecf5546db232397ed6/properdocs-1.6.7-py3-none-any.whl", hash = "sha256:6fa0cfa2e01bf338f684892c8a506cf70ea88ae7f3479c933b6fa20168101cbd", size = 225406, upload-time = "2026-03-20T20:07:46.875Z" }, ] [[package]] @@ -1155,24 +1155,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/a3/a812df4e2dd5696d1f351d58b8fe16a405b234ad2886a0dab9183fb78109/pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc", size = 117552, upload-time = "2024-03-30T13:22:20.476Z" }, ] -[[package]] -name = "pydata-sphinx-theme" -version = "0.16.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "accessible-pygments" }, - { name = "babel" }, - { name = "beautifulsoup4" }, - { name = "docutils" }, - { name = "pygments" }, - { name = "sphinx" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/00/20/bb50f9de3a6de69e6abd6b087b52fa2418a0418b19597601605f855ad044/pydata_sphinx_theme-0.16.1.tar.gz", hash = "sha256:a08b7f0b7f70387219dc659bff0893a7554d5eb39b59d3b8ef37b8401b7642d7", size = 2412693, upload-time = "2024-12-17T10:53:39.537Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/e2/0d/8ba33fa83a7dcde13eb3c1c2a0c1cc29950a048bfed6d9b0d8b6bd710b4c/pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde", size = 6723264, upload-time = "2024-12-17T10:53:35.645Z" }, -] - [[package]] name = "pygithub" version = "2.5.0" @@ -1199,6 +1181,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293, upload-time = "2025-01-06T17:26:25.553Z" }, ] +[[package]] +name = "pygments-ansi-color" +version = "0.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/50/f9/7f417aaee98a74b4f757f2b72971245181fcf25d824d2e7a190345669eaf/pygments-ansi-color-0.3.0.tar.gz", hash = "sha256:7018954cf5b11d1e734383a1bafab5af613213f246109417fee3f76da26d5431", size = 7317, upload-time = "2023-05-18T22:44:35.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/17/8306a0bcd8c88d7761c2e73e831b0be026cd6873ce1f12beb3b4c9a03ffa/pygments_ansi_color-0.3.0-py3-none-any.whl", hash = "sha256:7eb063feaecadad9d4d1fd3474cbfeadf3486b64f760a8f2a00fc25392180aba", size = 10242, upload-time = "2023-05-18T22:44:34.287Z" }, +] + [[package]] name = "pyjwt" version = "2.10.1" @@ -1213,6 +1207,19 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pymdown-extensions" +version = "10.21.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/26/d1015444da4d952a1ca487a236b522eb979766f0295a0bd0c5fc089989a9/pymdown_extensions-10.21.3.tar.gz", hash = "sha256:72cfcf55f07aea0d4af2c4f11dd4e52466ddfb1bb819673146398e0bd3a77354", size = 
854140, upload-time = "2026-05-13T12:57:32.267Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/85/545a951eecc270fcd688288c600017e2050a1aacb56c711d208586d3e470/pymdown_extensions-10.21.3-py3-none-any.whl", hash = "sha256:d7a5d08014fc571e80ca21dd6f854e31f94c489800350564d55d15b3c41e76b6", size = 269002, upload-time = "2026-05-13T12:57:30.296Z" }, +] + [[package]] name = "pynacl" version = "1.5.0" @@ -1359,6 +1366,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "pyyaml-env-tag" +version = "1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/2e/79c822141bfd05a853236b504869ebc6b70159afc570e1d5a20641782eaa/pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff", size = 5737, upload-time = "2025-05-13T15:24:01.64Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/11/432f32f8097b03e3cd5fe57e88efb685d964e2e5178a48ed61e841f7fdce/pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04", size = 4722, upload-time = "2025-05-13T15:23:59.629Z" }, +] + [[package]] name = "requests" version = "2.32.3" @@ -1399,15 +1418,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7f/d0/578c47dd68152ddddddf31cd7fc67dc30b7cdf639a86275fda821b0d9d98/ruff-0.15.6-py3-none-win_arm64.whl", hash = "sha256:c34de3dd0b0ba203be50ae70f5910b17188556630e2178fd7d79fc030eb0d837", size = 11060497, upload-time = "2026-03-12T23:05:25.968Z" }, ] -[[package]] -name = "setuptools" -version = "75.8.0" -source = { registry = "https://pypi.org/simple" } -sdist 
= { url = "https://files.pythonhosted.org/packages/92/ec/089608b791d210aec4e7f97488e67ab0d33add3efccb83a056cbafe3a2a6/setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6", size = 1343222, upload-time = "2025-01-08T18:28:23.98Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/69/8a/b9dc7678803429e4a3bc9ba462fa3dd9066824d3c607490235c6a796be5a/setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3", size = 1228782, upload-time = "2025-01-08T18:28:20.912Z" }, -] - [[package]] name = "six" version = "1.17.0" @@ -1417,167 +1427,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, ] -[[package]] -name = "snowballstemmer" -version = "2.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/44/7b/af302bebf22c749c56c9c3e8ae13190b5b5db37a33d9068652e8f73b7089/snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", size = 86699, upload-time = "2021-11-16T18:38:38.009Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ed/dc/c02e01294f7265e63a7315fe086dd1df7dacb9f840a804da846b96d01b96/snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a", size = 93002, upload-time = "2021-11-16T18:38:34.792Z" }, -] - -[[package]] -name = "soupsieve" -version = "2.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d7/ce/fbaeed4f9fb8b2daa961f90591662df6a86c1abf25c548329a86920aedfb/soupsieve-2.6.tar.gz", hash = 
"sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb", size = 101569, upload-time = "2024-08-13T13:39:12.166Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/c2/fe97d779f3ef3b15f05c94a2f1e3d21732574ed441687474db9d342a7315/soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9", size = 36186, upload-time = "2024-08-13T13:39:10.986Z" }, -] - -[[package]] -name = "sphinx" -version = "8.1.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "alabaster" }, - { name = "babel" }, - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "docutils" }, - { name = "imagesize" }, - { name = "jinja2" }, - { name = "packaging" }, - { name = "pygments" }, - { name = "requests" }, - { name = "snowballstemmer" }, - { name = "sphinxcontrib-applehelp" }, - { name = "sphinxcontrib-devhelp" }, - { name = "sphinxcontrib-htmlhelp" }, - { name = "sphinxcontrib-jsmath" }, - { name = "sphinxcontrib-qthelp" }, - { name = "sphinxcontrib-serializinghtml" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611, upload-time = "2024-10-13T20:27:13.93Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125, upload-time = "2024-10-13T20:27:10.448Z" }, -] - -[[package]] -name = "sphinx-autoapi" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "astroid" }, - { name = "jinja2" }, - { name = "pyyaml" }, - { name = "sphinx" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/4a/eb/cc243583bb1d518ca3b10998c203d919a8ed90affd4831f2b61ad09043d2/sphinx_autoapi-3.4.0.tar.gz", hash = "sha256:e6d5371f9411bbb9fca358c00a9e57aef3ac94cbfc5df4bab285946462f69e0c", size = 29292, upload-time = "2024-11-30T01:09:40.956Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/de/d6/f2acdc2567337fd5f5dc091a4e58d8a0fb14927b9779fc1e5ecee96d9824/sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92", size = 34095, upload-time = "2024-11-30T01:09:17.272Z" }, -] - -[[package]] -name = "sphinx-reredirects" -version = "0.1.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.11'", -] -dependencies = [ - { name = "sphinx", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/16/6b/bcca2785de4071f604a722444d4d7ba8a9d40de3c14ad52fce93e6d92694/sphinx_reredirects-0.1.6.tar.gz", hash = "sha256:c491cba545f67be9697508727818d8626626366245ae64456fe29f37e9bbea64", size = 7080, upload-time = "2025-03-22T10:52:30.271Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/6f/0b3625be30a1a50f9e4c2cb2ec147b08f15ed0e9f8444efcf274b751300b/sphinx_reredirects-0.1.6-py3-none-any.whl", hash = "sha256:efd50c766fbc5bf40cd5148e10c00f2c00d143027de5c5e48beece93cc40eeea", size = 5675, upload-time = "2025-03-22T10:52:29.113Z" }, -] - -[[package]] -name = "sphinx-reredirects" -version = "1.1.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version >= '3.12' and python_full_version < '3.14'", - "python_full_version == '3.11.*'", -] -dependencies = [ - { name = "sphinx", marker = "python_full_version >= '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1b/8d/0e39fe2740d7d71417edf9a6424aa80ca2c27c17fc21282cdc39f90d5a40/sphinx_reredirects-1.1.0.tar.gz", hash = 
"sha256:fb9b195335ab14b43f8273287d0c7eeb637ba6c56c66581c11b47202f6718b29", size = 614624, upload-time = "2025-12-22T08:28:02.792Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/81/b5dd07067f3daac6d23687ec737b2d593740671ebcd145830c8f92d381c5/sphinx_reredirects-1.1.0-py3-none-any.whl", hash = "sha256:4b5692273c72cd2d4d917f4c6f87d5919e4d6114a752d4be033f7f5f6310efd9", size = 6351, upload-time = "2025-12-22T08:27:59.724Z" }, -] - -[[package]] -name = "sphinxcontrib-applehelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/6e/b837e84a1a704953c62ef8776d45c3e8d759876b4a84fe14eba2859106fe/sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1", size = 20053, upload-time = "2024-07-29T01:09:00.465Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/85/9ebeae2f76e9e77b952f4b274c27238156eae7979c5421fba91a28f4970d/sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5", size = 119300, upload-time = "2024-07-29T01:08:58.99Z" }, -] - -[[package]] -name = "sphinxcontrib-devhelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f6/d2/5beee64d3e4e747f316bae86b55943f51e82bb86ecd325883ef65741e7da/sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad", size = 12967, upload-time = "2024-07-29T01:09:23.417Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/35/7a/987e583882f985fe4d7323774889ec58049171828b58c2217e7f79cdf44e/sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2", size = 82530, upload-time = "2024-07-29T01:09:21.945Z" }, -] - -[[package]] -name = "sphinxcontrib-htmlhelp" -version = "2.1.0" 
-source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/43/93/983afd9aa001e5201eab16b5a444ed5b9b0a7a010541e0ddfbbfd0b2470c/sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9", size = 22617, upload-time = "2024-07-29T01:09:37.889Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0a/7b/18a8c0bcec9182c05a0b3ec2a776bba4ead82750a55ff798e8d406dae604/sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8", size = 98705, upload-time = "2024-07-29T01:09:36.407Z" }, -] - -[[package]] -name = "sphinxcontrib-jsmath" -version = "1.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b2/e8/9ed3830aeed71f17c026a07a5097edcf44b692850ef215b161b8ad875729/sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8", size = 5787, upload-time = "2019-01-21T16:10:16.347Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/42/4c8646762ee83602e3fb3fbe774c2fac12f317deb0b5dbeeedd2d3ba4b77/sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", size = 5071, upload-time = "2019-01-21T16:10:14.333Z" }, -] - -[[package]] -name = "sphinxcontrib-qthelp" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/68/bc/9104308fc285eb3e0b31b67688235db556cd5b0ef31d96f30e45f2e51cae/sphinxcontrib_qthelp-2.0.0.tar.gz", hash = "sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab", size = 17165, upload-time = "2024-07-29T01:09:56.435Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/27/83/859ecdd180cacc13b1f7e857abf8582a64552ea7a061057a6c716e790fce/sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb", size = 88743, upload-time = "2024-07-29T01:09:54.885Z" }, -] - -[[package]] -name = "sphinxcontrib-serializinghtml" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/3b/44/6716b257b0aa6bfd51a1b31665d1c205fb12cb5ad56de752dfa15657de2f/sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d", size = 16080, upload-time = "2024-07-29T01:10:09.332Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, -] - -[[package]] -name = "stack-data" -version = "0.6.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "asttokens" }, - { name = "executing" }, - { name = "pure-eval" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, -] - [[package]] name = "toml" version = "0.10.2" @@ -1626,15 +1475,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] -[[package]] -name = "traitlets" -version = "5.14.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, -] - [[package]] name = "typing-extensions" version = "4.12.2" @@ -1677,12 +1517,35 @@ wheels = [ ] [[package]] -name = "wcwidth" -version = "0.2.13" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301, upload-time = "2024-01-06T02:10:57.829Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166, upload-time = "2024-01-06T02:10:55.763Z" }, +name = "watchdog" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/56/90994d789c61df619bfc5ce2ecdabd5eeff564e1eb47512bd01b5e019569/watchdog-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d1cdb490583ebd691c012b3d6dae011000fe42edb7a82ece80965b42abd61f26", size = 96390, upload-time = "2024-11-01T14:06:24.793Z" }, + { url = "https://files.pythonhosted.org/packages/55/46/9a67ee697342ddf3c6daa97e3a587a56d6c4052f881ed926a849fcf7371c/watchdog-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc64ab3bdb6a04d69d4023b29422170b74681784ffb9463ed4870cf2f3e66112", size = 88389, upload-time = "2024-11-01T14:06:27.112Z" }, + { url = "https://files.pythonhosted.org/packages/44/65/91b0985747c52064d8701e1075eb96f8c40a79df889e59a399453adfb882/watchdog-6.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c897ac1b55c5a1461e16dae288d22bb2e412ba9807df8397a635d88f671d36c3", size = 89020, upload-time = "2024-11-01T14:06:29.876Z" }, + { url = "https://files.pythonhosted.org/packages/e0/24/d9be5cd6642a6aa68352ded4b4b10fb0d7889cb7f45814fb92cecd35f101/watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c", size = 96393, upload-time = "2024-11-01T14:06:31.756Z" }, + { url = "https://files.pythonhosted.org/packages/63/7a/6013b0d8dbc56adca7fdd4f0beed381c59f6752341b12fa0886fa7afc78b/watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2", size = 88392, upload-time = "2024-11-01T14:06:32.99Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/40/b75381494851556de56281e053700e46bff5b37bf4c7267e858640af5a7f/watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c", size = 89019, upload-time = "2024-11-01T14:06:34.963Z" }, + { url = "https://files.pythonhosted.org/packages/39/ea/3930d07dafc9e286ed356a679aa02d777c06e9bfd1164fa7c19c288a5483/watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948", size = 96471, upload-time = "2024-11-01T14:06:37.745Z" }, + { url = "https://files.pythonhosted.org/packages/12/87/48361531f70b1f87928b045df868a9fd4e253d9ae087fa4cf3f7113be363/watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860", size = 88449, upload-time = "2024-11-01T14:06:39.748Z" }, + { url = "https://files.pythonhosted.org/packages/5b/7e/8f322f5e600812e6f9a31b75d242631068ca8f4ef0582dd3ae6e72daecc8/watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0", size = 89054, upload-time = "2024-11-01T14:06:41.009Z" }, + { url = "https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" }, + { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 89057, upload-time = "2024-11-01T14:06:47.324Z" }, + { url = "https://files.pythonhosted.org/packages/30/ad/d17b5d42e28a8b91f8ed01cb949da092827afb9995d4559fd448d0472763/watchdog-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c7ac31a19f4545dd92fc25d200694098f42c9a8e391bc00bdd362c5736dbf881", size = 87902, upload-time = "2024-11-01T14:06:53.119Z" }, + { url = "https://files.pythonhosted.org/packages/5c/ca/c3649991d140ff6ab67bfc85ab42b165ead119c9e12211e08089d763ece5/watchdog-6.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9513f27a1a582d9808cf21a07dae516f0fab1cf2d7683a742c498b93eedabb11", size = 88380, upload-time = "2024-11-01T14:06:55.19Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" }, + { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" }, + { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" }, + { url = "https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" }, + { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" }, + { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" }, + { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = 
"sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, ] [[package]]