From dc26c19c7a761804f93d9e675483af41d9559c58 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Thu, 10 Mar 2022 16:25:32 +0800 Subject: [PATCH 1/6] update to release 0.5.0 (#34) --- .github/workflows/build.yml | 12 +- Cargo.lock | 2 +- Cargo.toml | 2 +- dev/create_license.py | 252 ++++++++++++++++++++++++++++++++++++ 4 files changed, 260 insertions(+), 8 deletions(-) create mode 100644 dev/create_license.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9053964..b11e3c7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -32,7 +32,7 @@ jobs: toolchain: stable override: true - name: Generate license file - run: python ../dev/create_license.py + run: python ./dev/create_license.py - uses: actions/upload-artifact@v2 with: name: python-wheel-license @@ -61,7 +61,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install maturin==0.12.6 + pip install maturin==0.12.10 - run: rm LICENSE.txt - name: Download LICENSE.txt @@ -71,7 +71,7 @@ jobs: path: . 
- name: Build Python package - run: maturin build --release --no-sdist --strip --locked + run: maturin build --release --strip --cargo-extra-args="--locked" - name: List Windows wheels if: matrix.os == 'windows-latest' @@ -103,10 +103,10 @@ jobs: - name: Build wheels run: | export RUSTFLAGS='-C target-cpu=skylake' - docker run --rm -v $(pwd)/..:/io \ + docker run --rm -v $(pwd):/io \ --workdir /io \ - konstin2/maturin:v0.12.6 \ - build --release --manylinux 2010 --locked + konstin2/maturin:v0.12.10 \ + build --release --manylinux 2010 --cargo-extra-args="--locked" - name: Archive wheels uses: actions/upload-artifact@v2 with: diff --git a/Cargo.lock b/Cargo.lock index ce96b94..397cf85 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -338,7 +338,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "0.4.0" +version = "0.5.0" dependencies = [ "datafusion", "datafusion-common", diff --git a/Cargo.toml b/Cargo.toml index 82e9141..ed977fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "0.4.0" +version = "0.5.0" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow "] diff --git a/dev/create_license.py b/dev/create_license.py new file mode 100644 index 0000000..2a67cb8 --- /dev/null +++ b/dev/create_license.py @@ -0,0 +1,252 @@ +#!/usr/bin/python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is a mirror of https://github.com/apache/arrow-datafusion/blob/master/dev/create_license.py + +import json +import subprocess + +subprocess.check_output(["cargo", "install", "cargo-license"]) +data = subprocess.check_output( + [ + "cargo", + "license", + "--avoid-build-deps", + "--avoid-dev-deps", + "--do-not-bundle", + "--json", + ] +) +data = json.loads(data) + +result = """ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +result += "\n------------------\n\n" +result += "This software is built and contains the following software:\n\n" +result += "(automatically generated via [cargo-license](https://crates.io/crates/cargo-license))\n\n" +for item in data: + license = item["license"] + name = item["name"] + version = item["version"] + repository = item["repository"] + result += "------------------\n\n" + result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n" + +with open("LICENSE.txt", "w") as f: + f.write(result) From cc1649a526f8713efc6a7f5c9b307f4f3e00b89c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 14 Mar 2022 18:14:17 -0600 Subject: [PATCH 2/6] Implement select_columns (#39) --- datafusion/tests/test_dataframe.py | 10 ++++++++++ src/dataframe.rs | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py index e6b9ef1..99ff85a 100644 --- a/datafusion/tests/test_dataframe.py +++ b/datafusion/tests/test_dataframe.py @@ -61,6 +61,16 @@ def test_select(df): assert result.column(1) == pa.array([-3, -3, -3]) +def test_select_colums(df): + df = df.select_columns("b", "a") + + # execute and collect the first (and only) batch + result = df.collect()[0] + + assert result.column(0) == pa.array([4, 5, 6]) + assert result.column(1) == pa.array([1, 2, 3]) + + def test_filter(df): df = 
df.select( column("a") + column("b"), diff --git a/src/dataframe.rs b/src/dataframe.rs index 7c21102..964f042 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -51,6 +51,12 @@ impl PyDataFrame { self.df.schema().into() } + #[args(args = "*")] + fn select_columns(&self, args: Vec<&str>) -> PyResult { + let df = self.df.select_columns(&args)?; + Ok(Self::new(df)) + } + #[args(args = "*")] fn select(&self, args: Vec) -> PyResult { let expr = args.into_iter().map(|e| e.into()).collect(); From 7b336a98c097ba86c50bb485916006e053979a6c Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 15 Mar 2022 08:14:27 +0800 Subject: [PATCH 3/6] update readme and changelog (#38) * fix example import * Update README.md Co-authored-by: Andrew Lamb Co-authored-by: Andrew Lamb --- CHANGELOG.md | 137 +++++++++++++++------------------------------------ README.md | 12 +++-- 2 files changed, 50 insertions(+), 99 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a07cb00..ac2a181 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,111 +19,56 @@ # Changelog -## [python-0.4.0](https://github.com/apache/arrow-datafusion/tree/python-0.4.0) (2021-11-13) +## [Unreleased](https://github.com/datafusion-contrib/datafusion-python/tree/HEAD) -[Full Changelog](https://github.com/apache/arrow-datafusion/compare/python-0.3.0...python-0.4.0) +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0...HEAD) -**Breaking changes:** - -- Add function volatility to Signature [\#1071](https://github.com/apache/arrow-datafusion/pull/1071) [[sql](https://github.com/apache/arrow-datafusion/labels/sql)] ([pjmore](https://github.com/pjmore)) -- Make TableProvider.scan\(\) and PhysicalPlanner::create\_physical\_plan\(\) async [\#1013](https://github.com/apache/arrow-datafusion/pull/1013) ([rdettai](https://github.com/rdettai)) -- Reorganize table providers by table format [\#1010](https://github.com/apache/arrow-datafusion/pull/1010) ([rdettai](https://github.com/rdettai)) - 
-**Implemented enhancements:** - -- Build abi3 wheels for python binding [\#921](https://github.com/apache/arrow-datafusion/issues/921) -- Release documentation for python binding [\#837](https://github.com/apache/arrow-datafusion/issues/837) -- use arrow 6.1.0 [\#1255](https://github.com/apache/arrow-datafusion/pull/1255) ([Jimexist](https://github.com/Jimexist)) -- python `lit` function to support bool and byte vec [\#1152](https://github.com/apache/arrow-datafusion/pull/1152) ([Jimexist](https://github.com/Jimexist)) -- add python binding for `approx_distinct` aggregate function [\#1134](https://github.com/apache/arrow-datafusion/pull/1134) ([Jimexist](https://github.com/Jimexist)) -- refactor datafusion python `lit` function to allow different types [\#1130](https://github.com/apache/arrow-datafusion/pull/1130) ([Jimexist](https://github.com/Jimexist)) -- \[python\] add digest python function [\#1127](https://github.com/apache/arrow-datafusion/pull/1127) ([Jimexist](https://github.com/Jimexist)) -- \[crypto\] add `blake3` algorithm to `digest` function [\#1086](https://github.com/apache/arrow-datafusion/pull/1086) ([Jimexist](https://github.com/Jimexist)) -- \[crypto\] add blake2b and blake2s functions [\#1081](https://github.com/apache/arrow-datafusion/pull/1081) ([Jimexist](https://github.com/Jimexist)) -- fix: fix joins on Float32/Float64 columns bug [\#1054](https://github.com/apache/arrow-datafusion/pull/1054) ([francis-du](https://github.com/francis-du)) -- Update DataFusion to arrow 6.0 [\#984](https://github.com/apache/arrow-datafusion/pull/984) ([alamb](https://github.com/alamb)) -- \[Python\] Add support to perform sql query on in-memory datasource. 
[\#981](https://github.com/apache/arrow-datafusion/pull/981) ([mmuru](https://github.com/mmuru)) -- \[Python\] - Support show function for DataFrame api of python library [\#942](https://github.com/apache/arrow-datafusion/pull/942) ([francis-du](https://github.com/francis-du)) -- Rework the python bindings using conversion traits from arrow-rs [\#873](https://github.com/apache/arrow-datafusion/pull/873) ([kszucs](https://github.com/kszucs)) - -**Fixed bugs:** - -- Error in `python test` check / maturn python build: `function or associated item not found in `proc_macro::Literal` [\#961](https://github.com/apache/arrow-datafusion/issues/961) -- Use UUID to create unique table names in python binding [\#1111](https://github.com/apache/arrow-datafusion/pull/1111) ([hippowdon](https://github.com/hippowdon)) -- python: fix generated table name in dataframe creation [\#1078](https://github.com/apache/arrow-datafusion/pull/1078) ([houqp](https://github.com/houqp)) -- fix: joins on Timestamp columns [\#1055](https://github.com/apache/arrow-datafusion/pull/1055) ([francis-du](https://github.com/francis-du)) -- register datafusion.functions as a python package [\#995](https://github.com/apache/arrow-datafusion/pull/995) ([houqp](https://github.com/houqp)) - -**Documentation updates:** - -- python: update docs to use new APIs [\#1287](https://github.com/apache/arrow-datafusion/pull/1287) ([houqp](https://github.com/houqp)) -- Fix typo on Python functions [\#1207](https://github.com/apache/arrow-datafusion/pull/1207) ([j-a-m-l](https://github.com/j-a-m-l)) -- fix deadlink in python/readme [\#1002](https://github.com/apache/arrow-datafusion/pull/1002) ([waynexia](https://github.com/waynexia)) - -**Performance improvements:** +**Merged pull requests:** -- optimize build profile for datafusion python binding, cli and ballista [\#1137](https://github.com/apache/arrow-datafusion/pull/1137) ([houqp](https://github.com/houqp)) +- Add PyDataFrame.explain 
[\#36](https://github.com/datafusion-contrib/datafusion-python/pull/36) ([andygrove](https://github.com/andygrove)) +- Release 0.5.0 [\#34](https://github.com/datafusion-contrib/datafusion-python/pull/34) ([Jimexist](https://github.com/Jimexist)) +- disable nightly in workflow [\#33](https://github.com/datafusion-contrib/datafusion-python/pull/33) ([Jimexist](https://github.com/Jimexist)) +- update requirements to 37 and 310, update readme [\#32](https://github.com/datafusion-contrib/datafusion-python/pull/32) ([Jimexist](https://github.com/Jimexist)) +- Add custom global allocator [\#30](https://github.com/datafusion-contrib/datafusion-python/pull/30) ([matthewmturner](https://github.com/matthewmturner)) +- Remove pandas dependency [\#25](https://github.com/datafusion-contrib/datafusion-python/pull/25) ([matthewmturner](https://github.com/matthewmturner)) +- upgrade datafusion and pyo3 [\#20](https://github.com/datafusion-contrib/datafusion-python/pull/20) ([Jimexist](https://github.com/Jimexist)) +- update maturin 0.12+ [\#17](https://github.com/datafusion-contrib/datafusion-python/pull/17) ([Jimexist](https://github.com/Jimexist)) +- Update README.md [\#16](https://github.com/datafusion-contrib/datafusion-python/pull/16) ([Jimexist](https://github.com/Jimexist)) +- apply cargo clippy --fix [\#15](https://github.com/datafusion-contrib/datafusion-python/pull/15) ([Jimexist](https://github.com/Jimexist)) +- update test workflow to include rust clippy and check [\#14](https://github.com/datafusion-contrib/datafusion-python/pull/14) ([Jimexist](https://github.com/Jimexist)) +- use maturin 0.12.6 [\#13](https://github.com/datafusion-contrib/datafusion-python/pull/13) ([Jimexist](https://github.com/Jimexist)) +- apply cargo fmt [\#12](https://github.com/datafusion-contrib/datafusion-python/pull/12) ([Jimexist](https://github.com/Jimexist)) +- use stable not nightly [\#11](https://github.com/datafusion-contrib/datafusion-python/pull/11) 
([Jimexist](https://github.com/Jimexist)) +- ci: test against more compilers, setup clippy and fix clippy lints [\#9](https://github.com/datafusion-contrib/datafusion-python/pull/9) ([cpcloud](https://github.com/cpcloud)) +- Fix use of importlib.metadata and unify requirements.txt [\#8](https://github.com/datafusion-contrib/datafusion-python/pull/8) ([cpcloud](https://github.com/cpcloud)) +- Ship the Cargo.lock file in the source distribution [\#7](https://github.com/datafusion-contrib/datafusion-python/pull/7) ([cpcloud](https://github.com/cpcloud)) +- add \_\_version\_\_ attribute to datafusion object [\#3](https://github.com/datafusion-contrib/datafusion-python/pull/3) ([tfeda](https://github.com/tfeda)) +- fix ci by fixing directories [\#2](https://github.com/datafusion-contrib/datafusion-python/pull/2) ([Jimexist](https://github.com/Jimexist)) +- setup workflow [\#1](https://github.com/datafusion-contrib/datafusion-python/pull/1) ([Jimexist](https://github.com/Jimexist)) + +## [0.5.0](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0) (2022-03-10) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc2...0.5.0) + +## [0.5.0-rc2](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc2) (2022-03-10) + +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/0.5.0-rc1...0.5.0-rc2) **Closed issues:** -- InList expr with NULL literals do not work [\#1190](https://github.com/apache/arrow-datafusion/issues/1190) -- update the homepage README to include values, `approx_distinct`, etc. [\#1171](https://github.com/apache/arrow-datafusion/issues/1171) -- \[Python\]: Inconsistencies with Python package name [\#1011](https://github.com/apache/arrow-datafusion/issues/1011) -- Wanting to contribute to project where to start? 
[\#983](https://github.com/apache/arrow-datafusion/issues/983) -- delete redundant code [\#973](https://github.com/apache/arrow-datafusion/issues/973) -- \[Python\]: register custom datasource [\#906](https://github.com/apache/arrow-datafusion/issues/906) -- How to build DataFusion python wheel [\#853](https://github.com/apache/arrow-datafusion/issues/853) -- Produce a design for a metrics framework [\#21](https://github.com/apache/arrow-datafusion/issues/21) - +- Add support for Ballista [\#37](https://github.com/datafusion-contrib/datafusion-python/issues/37) +- Implement DataFrame.explain [\#35](https://github.com/datafusion-contrib/datafusion-python/issues/35) -For older versions, see [apache/arrow/CHANGELOG.md](https://github.com/apache/arrow/blob/master/CHANGELOG.md) +## [0.5.0-rc1](https://github.com/datafusion-contrib/datafusion-python/tree/0.5.0-rc1) (2022-03-09) -## [python-0.3.0](https://github.com/apache/arrow-datafusion/tree/python-0.3.0) (2021-08-10) - -[Full Changelog](https://github.com/apache/arrow-datafusion/compare/4.0.0...python-0.3.0) - -**Implemented enhancements:** - -- add more math functions and unit tests to `python` crate [\#748](https://github.com/apache/arrow-datafusion/pull/748) ([Jimexist](https://github.com/Jimexist)) -- Expose ExecutionContext.register\_csv to the python bindings [\#524](https://github.com/apache/arrow-datafusion/pull/524) ([kszucs](https://github.com/kszucs)) -- Implement missing join types for Python dataframe [\#503](https://github.com/apache/arrow-datafusion/pull/503) ([Dandandan](https://github.com/Dandandan)) -- Add missing functions to python [\#388](https://github.com/apache/arrow-datafusion/pull/388) ([jgoday](https://github.com/jgoday)) - -**Fixed bugs:** - -- fix maturin version in pyproject.toml [\#756](https://github.com/apache/arrow-datafusion/pull/756) ([Jimexist](https://github.com/Jimexist)) -- fix pyarrow type id mapping in `python` crate [\#742](https://github.com/apache/arrow-datafusion/pull/742) 
([Jimexist](https://github.com/Jimexist)) +[Full Changelog](https://github.com/datafusion-contrib/datafusion-python/compare/4c98b8e9c3c3f8e2e6a8f2d1ffcfefda344c4680...0.5.0-rc1) **Closed issues:** -- Confirm git tagging strategy for releases [\#770](https://github.com/apache/arrow-datafusion/issues/770) -- arrow::util::pretty::pretty\_format\_batches missing [\#769](https://github.com/apache/arrow-datafusion/issues/769) -- move the `assert_batches_eq!` macros to a non part of datafusion [\#745](https://github.com/apache/arrow-datafusion/issues/745) -- fix an issue where aliases are not respected in generating downstream schemas in window expr [\#592](https://github.com/apache/arrow-datafusion/issues/592) -- make the planner to print more succinct and useful information in window function explain clause [\#526](https://github.com/apache/arrow-datafusion/issues/526) -- move window frame module to be in `logical_plan` [\#517](https://github.com/apache/arrow-datafusion/issues/517) -- use a more rust idiomatic way of handling nth\_value [\#448](https://github.com/apache/arrow-datafusion/issues/448) -- create a test with more than one partition for window functions [\#435](https://github.com/apache/arrow-datafusion/issues/435) -- Implement hash-partitioned hash aggregate [\#27](https://github.com/apache/arrow-datafusion/issues/27) -- Consider using GitHub pages for DataFusion/Ballista documentation [\#18](https://github.com/apache/arrow-datafusion/issues/18) -- Update "repository" in Cargo.toml [\#16](https://github.com/apache/arrow-datafusion/issues/16) - -**Merged pull requests:** - -- fix python binding for `concat`, `concat_ws`, and `random` [\#768](https://github.com/apache/arrow-datafusion/pull/768) ([Jimexist](https://github.com/Jimexist)) -- fix 226, make `concat`, `concat_ws`, and `random` work with `Python` crate [\#761](https://github.com/apache/arrow-datafusion/pull/761) ([Jimexist](https://github.com/Jimexist)) -- fix python crate with the changes to logical 
plan builder [\#650](https://github.com/apache/arrow-datafusion/pull/650) ([Jimexist](https://github.com/Jimexist)) -- use nightly nightly-2021-05-10 [\#536](https://github.com/apache/arrow-datafusion/pull/536) ([Jimexist](https://github.com/Jimexist)) -- Define the unittests using pytest [\#493](https://github.com/apache/arrow-datafusion/pull/493) ([kszucs](https://github.com/kszucs)) -- use requirements.txt to formalize python deps [\#484](https://github.com/apache/arrow-datafusion/pull/484) ([Jimexist](https://github.com/Jimexist)) -- update cargo.toml in python crate and fix unit test due to hash joins [\#483](https://github.com/apache/arrow-datafusion/pull/483) ([Jimexist](https://github.com/Jimexist)) -- simplify python function definitions [\#477](https://github.com/apache/arrow-datafusion/pull/477) ([Jimexist](https://github.com/Jimexist)) -- Expose DataFrame::sort in the python bindings [\#469](https://github.com/apache/arrow-datafusion/pull/469) ([kszucs](https://github.com/kszucs)) -- Revert "Revert "Add datafusion-python \(\#69\)" \(\#257\)" [\#270](https://github.com/apache/arrow-datafusion/pull/270) ([andygrove](https://github.com/andygrove)) -- Revert "Add datafusion-python \(\#69\)" [\#257](https://github.com/apache/arrow-datafusion/pull/257) ([andygrove](https://github.com/andygrove)) -- update arrow-rs deps to latest master [\#216](https://github.com/apache/arrow-datafusion/pull/216) ([alamb](https://github.com/alamb)) -- Add datafusion-python [\#69](https://github.com/apache/arrow-datafusion/pull/69) ([jorgecarleitao](https://github.com/jorgecarleitao)) - - +- Investigate exposing additional optimizations [\#28](https://github.com/datafusion-contrib/datafusion-python/issues/28) +- Use custom allocator in Python build [\#27](https://github.com/datafusion-contrib/datafusion-python/issues/27) +- Why is pandas a requirement? 
[\#24](https://github.com/datafusion-contrib/datafusion-python/issues/24) +- Unable to build [\#18](https://github.com/datafusion-contrib/datafusion-python/issues/18) +- Setup CI against multiple Python version [\#6](https://github.com/datafusion-contrib/datafusion-python/issues/6) -\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* +\* _This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)_ diff --git a/README.md b/README.md index 0f7bd3a..d4b953f 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,9 @@ Simple usage: ```python import datafusion +from datafusion import functions as f import pyarrow -# an alias -f = datafusion.functions - # create a context ctx = datafusion.ExecutionContext() @@ -125,6 +123,14 @@ pip install datafusion python -m pip install datafusion ``` +You can verify the installation by running: + +```python +>>> import datafusion +>>> datafusion.__version__ +'0.5.0' +``` + ## How to develop This assumes that you have rust and cargo installed. We use the workflow recommended by [pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). 
From e24d59ca7390b2f93b6cb05cbc3c448176c409ce Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 15 Mar 2022 10:09:49 +0800 Subject: [PATCH 4/6] fix demo in readme (#40) --- README.md | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index d4b953f..46e6429 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ Simple usage: ```python import datafusion from datafusion import functions as f +from datafusion import col import pyarrow # create a context @@ -54,8 +55,8 @@ df = ctx.create_dataframe([[batch]]) # create a new statement df = df.select( - f.col("a") + f.col("b"), - f.col("a") - f.col("b"), + col("a") + col("b"), + col("a") - col("b"), ) # execute and collect the first (and only) batch @@ -68,12 +69,18 @@ assert result.column(1) == pyarrow.array([-3, -3, -3]) ### UDFs ```python +from datafusion import udf + def is_null(array: pyarrow.Array) -> pyarrow.Array: return array.is_null() -udf = f.udf(is_null, [pyarrow.int64()], pyarrow.bool_()) +is_null_arr = udf(is_null, [pyarrow.int64()], pyarrow.bool_(), 'stable') + +df = df.select(is_null_arr(col("a"))) + +result = df.collect() -df = df.select(udf(f.col("a"))) +assert result.column(0) == pyarrow.array([False] * 3) ``` ### UDAF @@ -81,18 +88,16 @@ df = df.select(udf(f.col("a"))) ```python import pyarrow import pyarrow.compute +from datafusion import udaf, Accumulator -class Accumulator: +class MyAccumulator(Accumulator): """ Interface of a user-defined accumulation. """ def __init__(self): self._sum = pyarrow.scalar(0.0) - def to_scalars(self) -> [pyarrow.Scalar]: - return [self._sum] - def update(self, values: pyarrow.Array) -> None: # not nice since pyarrow scalars can't be summed yet. This breaks on `None` self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(values).as_py()) @@ -101,18 +106,25 @@ class Accumulator: # not nice since pyarrow scalars can't be summed yet. 
This breaks on `None` self._sum = pyarrow.scalar(self._sum.as_py() + pyarrow.compute.sum(states).as_py()) + def state(self) -> pyarrow.Array: + return pyarrow.array([self._sum.as_py()]) + def evaluate(self) -> pyarrow.Scalar: return self._sum -df = ... +df = ctx.create_dataframe([[batch]]) -udaf = f.udaf(Accumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()]) +my_udaf = udaf(MyAccumulator, pyarrow.float64(), pyarrow.float64(), [pyarrow.float64()], 'stable') df = df.aggregate( [], - [udaf(f.col("a"))] + [my_udaf(col("a"))] ) + +result = df.collect()[0] + +assert result.column(0) == pyarrow.array([6.0]) ``` ## How to install (from pip) From cbf684036d0664bf0372b37b612dd94fdb4c919f Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 15 Mar 2022 10:10:03 +0800 Subject: [PATCH 5/6] use __getitem__ for df column selection (#41) * use __getitem__ for df column selection * add python test --- Cargo.toml | 2 +- datafusion/tests/test_indexing.py | 55 +++++++++++++++++++++++++++++++ src/dataframe.rs | 36 ++++++++++++++++---- 3 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 datafusion/tests/test_indexing.py diff --git a/Cargo.toml b/Cargo.toml index ed977fc..5733845 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ rust-version = "1.57" [dependencies] tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.7" -pyo3 = { version = "0.15", features = ["extension-module", "abi3", "abi3-py36"] } +pyo3 = { version = "~0.15", features = ["extension-module", "abi3", "abi3-py36"] } datafusion = { version = "^7.0.0", features = ["pyarrow"] } datafusion-expr = { version = "^7.0.0" } datafusion-common = { version = "^7.0.0", features = ["pyarrow"] } diff --git a/datafusion/tests/test_indexing.py b/datafusion/tests/test_indexing.py new file mode 100644 index 0000000..6250e4b --- /dev/null +++ b/datafusion/tests/test_indexing.py @@ -0,0 +1,55 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# 
or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import pytest + +from datafusion import ExecutionContext + + +@pytest.fixture +def df(): + ctx = ExecutionContext() + + # create a RecordBatch and a new DataFrame from it + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 4, 6])], + names=["a", "b"], + ) + return ctx.create_dataframe([[batch]]) + + +def test_indexing(df): + assert df["a"] is not None + assert df["a", "b"] is not None + assert df[("a", "b")] is not None + assert df[["a"]] is not None + + +def test_err(df): + with pytest.raises(Exception) as e_info: + df["c"] + + assert "No field with unqualified name" in e_info.value.args[0] + + with pytest.raises(Exception) as e_info: + df[1] + + assert ( + "DataFrame can only be indexed by string index or indices" + in e_info.value.args[0] + ) diff --git a/src/dataframe.rs b/src/dataframe.rs index 964f042..c73b587 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -15,18 +15,18 @@ // specific language governing permissions and limitations // under the License. 
-use std::sync::Arc; - -use pyo3::prelude::*; - +use crate::utils::wait_for_future; +use crate::{errors::DataFusionError, expression::PyExpr}; use datafusion::arrow::datatypes::Schema; use datafusion::arrow::pyarrow::PyArrowConvert; use datafusion::arrow::util::pretty; use datafusion::dataframe::DataFrame; use datafusion::logical_plan::JoinType; - -use crate::utils::wait_for_future; -use crate::{errors::DataFusionError, expression::PyExpr}; +use pyo3::exceptions::PyTypeError; +use pyo3::mapping::PyMappingProtocol; +use pyo3::prelude::*; +use pyo3::types::PyTuple; +use std::sync::Arc; /// A PyDataFrame is a representation of a logical plan and an API to compose statements. /// Use it to build a plan and `.collect()` to execute the plan and collect the result. @@ -142,3 +142,25 @@ impl PyDataFrame { Ok(pretty::print_batches(&batches)?) } } + +#[pyproto] +impl PyMappingProtocol<'_> for PyDataFrame { + fn __getitem__(&self, key: PyObject) -> PyResult<Self> { + Python::with_gil(|py| { + if let Ok(key) = key.extract::<&str>(py) { + self.select_columns(vec![key]) + } else if let Ok(tuple) = key.extract::<&PyTuple>(py) { + let keys = tuple + .iter() + .map(|item| item.extract::<&str>()) + .collect::<PyResult<Vec<&str>>>()?; + self.select_columns(keys) + } else if let Ok(keys) = key.extract::<Vec<&str>>(py) { + self.select_columns(keys) + } else { + let message = "DataFrame can only be indexed by string index or indices"; + Err(PyTypeError::new_err(message)) + } + }) + } +} From caa48adf508e1997339ff39997afc4353688a7a6 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 15 Mar 2022 10:13:04 +0800 Subject: [PATCH 6/6] release 0.5.1 --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 397cf85..20d8e39 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -338,7 +338,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "0.5.0" +version = "0.5.1" dependencies = [ "datafusion", "datafusion-common", diff --git
a/Cargo.toml b/Cargo.toml index 5733845..a3abe04 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "0.5.0" +version = "0.5.1" homepage = "https://github.com/apache/arrow" repository = "https://github.com/apache/arrow" authors = ["Apache Arrow <dev@arrow.apache.org>"] diff --git a/README.md b/README.md index 46e6429..2ad97ad 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ You can verify the installation by running: ```python >>> import datafusion >>> datafusion.__version__ -'0.5.0' +'0.5.1' ``` ## How to develop